Пример #1
0
        public static unsafe void ReverseBits(this Span <int> span)
        {
            var intsReversed = 0;

            if (Avx2.IsSupported)
            {
                fixed(int *ptr = span)
                {
                    var vectorCount = span.Length / 8;

                    for (int i = 0; i < vectorCount; i++)
                    {
                        var vector  = Avx.LoadVector256((ptr + intsReversed));
                        var vector2 = Avx2.And(Avx2.And(vector, Vector256.Create(0xFF00FF)), Vector256.Create(-16711936));
                        vector =
                            Avx2.Add(
                                Avx2.Or(
                                    Avx2.ShiftRightLogical(vector, 8),
                                    Avx2.ShiftLeftLogical(vector, 24)
                                    ),
                                Avx2.Or(
                                    Avx2.ShiftLeftLogical(vector2, 8),
                                    Avx2.ShiftRightLogical(vector2, 24)
                                    )
                                );

                        Avx.Store(ptr + intsReversed, vector);
                        intsReversed += 8;
                    }
                }
            }

            for (int i = intsReversed; i < span.Length; i++)
            {
                span[i] = BinaryPrimitives.ReverseEndianness(span[i]);
            }

            fixed(void *ptr = span)
            {
                new Span <byte>(ptr, span.Length * 4).ReverseBits();
            }
        }
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var test = new SimpleTernaryOpTest__MultiplySubtractNegatedSingle();

            fixed(Vector256 <Single> *pFld1 = &test._fld1)
            fixed(Vector256 <Single> *pFld2 = &test._fld2)
            fixed(Vector256 <Single> *pFld3 = &test._fld3)
            {
                var result = Fma.MultiplySubtractNegated(
                    Avx.LoadVector256((Single *)(pFld1)),
                    Avx.LoadVector256((Single *)(pFld2)),
                    Avx.LoadVector256((Single *)(pFld3))
                    );

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
            }
        }
Пример #3
0
        public void RunClassLclFldScenario_Load()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

            var test = new SimpleTernaryOpTest__BlendVariableInt32();

            fixed(Vector256 <Int32> *pFld1 = &test._fld1)
            fixed(Vector256 <Int32> *pFld2 = &test._fld2)
            fixed(Vector256 <Int32> *pFld3 = &test._fld3)
            {
                var result = Avx2.BlendVariable(
                    Avx.LoadVector256((Int32 *)(pFld1)),
                    Avx.LoadVector256((Int32 *)(pFld2)),
                    Avx.LoadVector256((Int32 *)(pFld3))
                    );

                Unsafe.Write(_dataTable.outArrayPtr, result);
                ValidateResult(test._fld1, test._fld2, test._fld3, _dataTable.outArrayPtr);
            }
        }
Пример #4
0
        public static unsafe float Sum_AVX_stackalloc(float[] array)
        {
            Vector256 <float> sum = Avx.SetZeroVector256 <float>();

            fixed(float *ptr = &array[0])
            {
                for (int i = 0; i < array.Length; i += 8)
                {
                    var current = Avx.LoadVector256(ptr + i);
                    sum = Avx.Add(current, sum);
                }
            }

            // store __m256 into float[8] and sum all values via code - will it be slower than Sum_AVX()?
            var result = stackalloc float[8];

            Avx.Store(result, sum);

            return(*result + *(result + 1) + *(result + 2) + *(result + 3)
                   + *(result + 4) + *(result + 5) + *(result + 6) + *(result + 7));
        }
Пример #5
0
        public static unsafe float Sum_AVX(float[] array)
        {
            Vector256 <float> sum = Avx.SetZeroVector256 <float>();

            fixed(float *ptr = &array[0])
            {
                for (int i = 0; i < array.Length; i += 8)
                {
                    var current = Avx.LoadVector256(ptr + i);
                    sum = Avx.Add(current, sum);
                }
            }

            // sum all values in __m256 (horizontal sum)
            var ha      = Avx.HorizontalAdd(sum, sum);
            var ha2     = Avx.HorizontalAdd(ha, ha);
            var lo      = Avx.ExtractVector128(ha2, 1);
            var resultV = Sse.Add(Avx.GetLowerHalf(ha2), lo);

            return(Sse.ConvertToSingle(resultV));
        }
Пример #6
0
        public unsafe float[] ProcessDataUnsafe()
        {
            float[] results = new float[inputData.Length];
            fixed(float *inputPtr = &inputData[0])
            {
                float *inCurrent = inputPtr;

                fixed(float *resultPtr = &results[0])
                {
                    float *resEnd     = resultPtr + results.Length;
                    float *resCurrent = resultPtr;

                    while (resCurrent < resEnd)
                    {
                        Avx.Store(resCurrent, Avx.Sqrt(Avx.LoadVector256(inCurrent)));
                        resCurrent += 8;
                        inCurrent  += 8;
                    }
                }
            }

            return(results);
        }
        public static bool SequenceEqual_Avx(float[] array1, float[] array2)
        {
            if (array1.Length != array2.Length)
            {
                return(false);
            }

            if (array1.Length == 0)
            {
                return(true);//SequenceEqual_Soft(array1, array2, 0);
            }
            int i = 0;

            fixed(float *ptr1 = &array1[0])
            fixed(float *ptr2 = &array2[0])
            {
                if (array1.Length < 8)
                {
                    return(SequenceEqual_Soft(ptr1, ptr2, 0, array1.Length));
                }

                for (; i <= array1.Length - 8; i += 8) //16 for AVX512
                {
                    var vec1 = Avx.LoadVector256(ptr1 + i);
                    var vec2 = Avx.LoadVector256(ptr2 + i);
                    var ce   = Avx.Compare(vec1, vec2, FloatComparisonMode.NotEqualOrderedNonSignaling);

                    if (!Avx.TestZ(ce, ce))
                    {
                        return(false);
                    }
                }

                return(SequenceEqual_Soft(ptr1, ptr2, i, array1.Length));
            }
        }
Пример #8
0
        public unsafe void Serialize(ref MessagePackWriter writer, double[]?value, MessagePackSerializerOptions options)
        {
            if (value == null)
            {
                writer.WriteNil();
                return;
            }

            var inputLength = value.Length;

            writer.WriteArrayHeader(inputLength);
            if (inputLength == 0)
            {
                return;
            }

            var outputLength = inputLength * 9;
            var destination  = writer.GetSpan(outputLength);

            fixed(byte *pDestination = &destination[0])
            {
                var outputIterator = pDestination;

                fixed(double *pSource = &value[0])
                {
                    var inputEnd      = pSource + inputLength;
                    var inputIterator = (ulong *)pSource;

                    if (Avx2.IsSupported)
                    {
                        const int ShiftCount = 2;
                        const int Stride     = 1 << ShiftCount;

                        if (inputLength < Stride << 1)
                        {
                            goto ProcessEach;
                        }

                        var vectorShuffle = Vector256.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                        for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                        {
                            // Fetch 4 doubles.
                            var current = Avx.LoadVector256((byte *)inputIterator);
                            // Reorder Little Endian bytes to Big Endian.
                            var answer = Avx2.Shuffle(current, vectorShuffle).AsUInt64();
                            // Write 4 Big-Endian doubles.
                            *outputIterator++ = MessagePackCode.Float64;
                            *(ulong *)outputIterator = answer.GetElement(0);
                            outputIterator          += 8;
                            *outputIterator++ = MessagePackCode.Float64;
                            *(ulong *)outputIterator = answer.GetElement(1);
                            outputIterator          += 8;
                            *outputIterator++ = MessagePackCode.Float64;
                            *(ulong *)outputIterator = answer.GetElement(2);
                            outputIterator          += 8;
                            *outputIterator++ = MessagePackCode.Float64;
                            *(ulong *)outputIterator = answer.GetElement(3);
                            outputIterator          += 8;
                        }
                    }
                    else if (Ssse3.IsSupported)
                    {
                        const int ShiftCount = 1;
                        const int Stride     = 1 << ShiftCount;

                        if (inputLength < Stride << 1)
                        {
                            goto ProcessEach;
                        }

                        var vectorShuffle = Vector128.Create((byte)7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8);
                        for (var vectorizedEnd = inputIterator + ((inputLength >> ShiftCount) << ShiftCount); inputIterator != vectorizedEnd; inputIterator += Stride)
                        {
                            var current          = Sse2.LoadVector128((byte *)inputIterator);
                            var answer           = Ssse3.Shuffle(current, vectorShuffle).AsUInt64();
                            *   outputIterator++ = MessagePackCode.Float64;
                            *(ulong *)outputIterator = answer.GetElement(0);
                            outputIterator          += 8;
                            *outputIterator++ = MessagePackCode.Float64;
                            *(ulong *)outputIterator = answer.GetElement(1);
                            outputIterator          += 8;
                        }
                    }

ProcessEach:
                    while (inputIterator != inputEnd)
                    {
                        *   outputIterator++ = MessagePackCode.Float64;
                        var current          = *inputIterator++;
                        *   outputIterator++ = (byte)(current >> 56);
                        *   outputIterator++ = (byte)(current >> 48);
                        *   outputIterator++ = (byte)(current >> 40);
                        *   outputIterator++ = (byte)(current >> 32);
                        *   outputIterator++ = (byte)(current >> 24);
                        *   outputIterator++ = (byte)(current >> 16);
                        *   outputIterator++ = (byte)(current >> 8);
                        *   outputIterator++ = (byte)current;
                    }
                }
            }

            writer.Advance(outputLength);
        }
Пример #9
0
    private static unsafe double[] BilinearInterpol_AVX(
        double[] x,
        double[] A,
        double minXA,
        double maxXA,
        double[] B,
        double minXB,
        double maxXB,
        double weightB)
    {
        double[] z = new double[outputVectorSize];

        fixed(double *pX = &x[0], pA = &A[0], pB = &B[0], pZ = &z[0])
        {
            Vector256 <double> vWeightB = Vector256.Create(weightB);
            Vector256 <double> vWeightA = Vector256.Create(1 - weightB);

            Vector256 <double> vMinXA = Vector256.Create(minXA);
            Vector256 <double> vMaxXA = Vector256.Create(maxXA);
            Vector256 <double> vMinXB = Vector256.Create(minXB);
            Vector256 <double> vMaxXB = Vector256.Create(maxXB);

            double             deltaA  = (maxXA - minXA) / (double)(A.Length - 1);
            double             deltaB  = (maxXB - minXB) / (double)(B.Length - 1);
            Vector256 <double> vDeltaA = Vector256.Create(deltaA);
            Vector256 <double> vDeltaB = Vector256.Create(deltaB);

            double             invDeltaA  = 1.0 / deltaA;
            double             invDeltaB  = 1.0 / deltaB;
            Vector256 <double> vInvDeltaA = Vector256.Create(invDeltaA);
            Vector256 <double> vInvDeltaB = Vector256.Create(invDeltaB);

            Vector128 <int> ALengthMinusOne = Vector128.Create(A.Length - 1);
            Vector128 <int> BLengthMinusOne = Vector128.Create(B.Length - 1);
            Vector128 <int> One             = Vector128.Create(1);

            for (var i = 0; i < x.Length; i += Vector256 <double> .Count)
            {
                Vector256 <double> currentX = Avx.LoadVector256(pX + i);

                // Determine the largest a, such that A[i] = f(xA) and xA <= x[i].
                // This involves casting from double to int; here we use a Vector conversion.
                Vector256 <double> aDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXA), vInvDeltaA);
                Vector128 <int>    a       = Avx.ConvertToVector128Int32WithTruncation(aDouble);
                a = Sse41.Min(Sse41.Max(a, Vector128 <int> .Zero), ALengthMinusOne);
                Vector128 <int> aPlusOne = Sse41.Min(Sse2.Add(a, One), ALengthMinusOne);

                // Now, get the reference input, xA, for our index a.
                // This involves casting from  int to double.
                Vector256 <double> xA = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(a), vDeltaA), vMinXA);

                // Now, compute the lambda for our A reference point.
                Vector256 <double> currentXNormA = Avx.Max(vMinXA, Avx.Min(currentX, vMaxXA));
                Vector256 <double> lambdaA       = Avx.Multiply(Avx.Subtract(currentXNormA, xA), vInvDeltaA);

                // Now, we need to load up our reference points using Vector Gather operations.
                Vector256 <double> AVector        = Avx2.GatherVector256(pA, a, 8);
                Vector256 <double> AVectorPlusOne = Avx2.GatherVector256(pA, aPlusOne, 8);

                // Now, do the all of the above for our B reference point.
                Vector256 <double> bDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXB), vInvDeltaB);
                Vector128 <int>    b       = Avx.ConvertToVector128Int32WithTruncation(bDouble);
                b = Sse41.Min(Sse41.Max(b, Vector128 <int> .Zero), BLengthMinusOne);
                Vector128 <int> bPlusOne = Sse41.Min(Sse2.Add(b, One), BLengthMinusOne);

                Vector256 <double> xB            = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(b), vDeltaB), vMinXB);
                Vector256 <double> currentXNormB = Avx.Max(vMinXB, Avx.Min(currentX, vMaxXB));
                Vector256 <double> lambdaB       = Avx.Multiply(Avx.Subtract(currentXNormB, xB), vInvDeltaB);

                Vector256 <double> BVector        = Avx2.GatherVector256(pB, b, 8);
                Vector256 <double> BVectorPlusOne = Avx2.GatherVector256(pB, bPlusOne, 8);

                Vector256 <double> newZ = Avx.Add(Avx.Multiply(vWeightA, Avx.Add(AVector, Avx.Multiply(lambdaA, Avx.Subtract(AVectorPlusOne, AVector)))),
                                                  Avx.Multiply(vWeightB, Avx.Add(BVector, Avx.Multiply(lambdaB, Avx.Subtract(BVectorPlusOne, BVector)))));
                Avx.Store(pZ + i, newZ);
            }
        }

        return(z);
    }
        public unsafe override double[] Applay(double[] values, int halfWindow)
        {
            var windowSize = 2 * halfWindow + 1;
            var resultSize = values.Length - windowSize + 1;

            if (resultSize == 0)
            {
                return(null);
            }

            var a   = new double[resultSize];
            var sum = 0d;

            fixed(double *valueStart = values, aStart = a)
            {
                var valueCurrent       = valueStart;
                var valueEndwindowSize = valueCurrent + windowSize;

                while (valueCurrent < valueEndwindowSize)
                {
                    sum += *valueCurrent;
                    valueCurrent++;
                }

                var aCurrent     = aStart + 1;
                var aEnd         = aStart + resultSize;
                var aUnrolledEnd = aStart + (((resultSize - 1) >> 4) << 4);

                valueCurrent = valueStart;

                var valueWindowSize = valueStart + windowSize;
                var vWindowSize     = Vector256.Create((double)windowSize);

                var vCurrent = Vector256.Create(
                    (ulong)aCurrent,
                    (ulong)aCurrent + 4 * sizeof(double),
                    (ulong)aCurrent + 8 * sizeof(double),
                    (ulong)aCurrent + 12 * sizeof(double));

                var vValueCurrent = Vector256.Create(
                    (ulong)valueCurrent,
                    (ulong)valueCurrent + 4 * sizeof(double),
                    (ulong)valueCurrent + 8 * sizeof(double),
                    (ulong)valueCurrent + 12 * sizeof(double));


                var vValueWindowSize = Vector256.Create(
                    (ulong)valueWindowSize,
                    (ulong)valueWindowSize + 4 * sizeof(double),
                    (ulong)valueWindowSize + 8 * sizeof(double),
                    (ulong)valueWindowSize + 12 * sizeof(double));

                var vShiftIndex1 = Vector256.Create(16ul * sizeof(double));

                while (aCurrent < aUnrolledEnd)
                {
                    #region  1

                    Avx.Store(
                        aCurrent,
                        Avx.Divide(
                            Avx.Subtract(
                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(0)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(0))),
                            vWindowSize
                            )
                        );

                    #endregion

                    #region  2

                    Avx.Store(
                        (double *)vCurrent.GetElement(1),
                        Avx.Divide(
                            Avx.Subtract(
                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(1)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(1))),
                            vWindowSize
                            )
                        );

                    #endregion

                    #region  3

                    Avx.Store(
                        (double *)vCurrent.GetElement(2),
                        Avx.Divide(
                            Avx.Subtract(
                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(2)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(2))),
                            vWindowSize
                            )
                        );

                    #endregion

                    #region  4

                    Avx.Store(
                        (double *)vCurrent.GetElement(3),
                        Avx.Divide(
                            Avx.Subtract(
                                Avx.LoadVector256((double *)vValueWindowSize.GetElement(3)),
                                Avx.LoadVector256((double *)vValueCurrent.GetElement(3))),
                            vWindowSize
                            )
                        );

                    #endregion

                    vCurrent         = Avx.Add(vCurrent.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();
                    vValueCurrent    = Avx.Add(vValueCurrent.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();
                    vValueWindowSize = Avx.Add(vValueWindowSize.AsDouble(), vShiftIndex1.AsDouble()).AsUInt64();

                    aCurrent = (double *)vCurrent.GetElement(0);
                }

                valueWindowSize = (double *)vValueWindowSize.GetElement(0);
                valueCurrent    = (double *)vValueCurrent.GetElement(0);

                while (aCurrent < aEnd)
                {
                    *aCurrent = (*valueWindowSize - *valueCurrent) / windowSize;
                    aCurrent++;
                    valueCurrent++;
                    valueWindowSize++;
                }

                var aPrev = aStart;
                aCurrent = aStart + 1;
                aEnd     = aStart + resultSize;

                *aPrev = sum / windowSize;

                aUnrolledEnd = aStart + (((resultSize - 1) >> 2) << 2);

                vCurrent = Vector256.Create(
                    (ulong)aCurrent,
                    (ulong)aCurrent + sizeof(double),
                    (ulong)aCurrent + 2 * sizeof(double),
                    (ulong)aCurrent + 3 * sizeof(double));

                var vPrev = Vector256.Create(
                    (ulong)aPrev,
                    (ulong)aPrev + sizeof(double),
                    (ulong)aPrev + 2 * sizeof(double),
                    (ulong)aPrev + 3 * sizeof(double));

                var vShiftIndex = Vector256.Create(4ul * sizeof(double));

                while (aCurrent < aUnrolledEnd)
                {
                    #region  1

                    *aCurrent += *(double *)vPrev.GetElement(0);

                    #endregion

                    #region  2

                    *(double *)vCurrent.GetElement(1) += *(double *)vPrev.GetElement(1);

                    #endregion

                    #region  3

                    *(double *)vCurrent.GetElement(2) += *(double *)vPrev.GetElement(2);

                    #endregion

                    #region  4

                    *(double *)vCurrent.GetElement(3) += *(double *)vPrev.GetElement(3);

                    #endregion

                    vCurrent = Avx.Add(vCurrent.AsDouble(), vShiftIndex.AsDouble()).AsUInt64();
                    vPrev    = Avx.Add(vPrev.AsDouble(), vShiftIndex.AsDouble()).AsUInt64();

                    aCurrent = (double *)vCurrent.GetElement(0);
                }

                aPrev = (double *)vPrev.GetElement(0);

                while (aCurrent < aEnd)
                {
                    *aCurrent += *aPrev;
                    aCurrent++;
                    aPrev++;
                }
            }

            return(a);
        }
Пример #11
0
        public static unsafe String From(string value)
        {
            List <int> surrogates     = null;
            var        i              = 0;
            var        codepointIndex = 0;

            if (Avx2.IsSupported)
            {
                var mask                = (ushort)0xF800;
                var surrogateBits       = (ushort)0xD800;
                var maskVector          = Avx2.BroadcastScalarToVector256(&mask);
                var surrogateBitsVector = Avx2.BroadcastScalarToVector256(&surrogateBits);
                fixed(char *str = value)
                {
                    var step = Vector256 <ushort> .Count;

                    while (i + step <= value.Length)
                    {
                        var chars        = Avx.LoadVector256((ushort *)(str + i));
                        var masked       = Avx2.And(chars, maskVector);
                        var equality     = Avx2.CompareEqual(masked, surrogateBitsVector);
                        var equalityBits = Avx2.MoveMask(equality.As <ushort, byte>());
                        var surrogate    = equalityBits != 0;
                        if (!surrogate)
                        {
                            i += step;
                            codepointIndex += step;
                        }
                        else
                        {
                            var border = i + step;
                            while (i < border)
                            {
                                Parse();
                            }
                        }
                    }
                }
            }

            while (i < value.Length)
            {
                Parse();
            }

            return(surrogates is null
                ? new SurrogateFreeRegular(value)
                : new Regular(value, surrogates));

            void Parse()
            {
                if (char.IsHighSurrogate(value[i]))
                {
                    if (i + 1 >= value.Length)
                    {
                        throw new ArgumentException(
                                  "Value is malformed - it ends with a high surrogate.",
                                  nameof(value)
                                  );
                    }

                    if (!char.IsLowSurrogate(value[i + 1]))
                    {
                        throw new ArgumentException(
                                  $"Value is malformed - a high surrogate at [{i}] not followed by a low surrogate.",
                                  nameof(value)
                                  );
                    }

                    surrogates ??= new List <int>();
                    surrogates.Add(codepointIndex);
                    i += 2;
                }
                else if (char.IsLowSurrogate(value, i))
                {
                    throw new ArgumentException(
                              $"Value is malformed - a low surrogate at [{i}] not following a high surrogate.",
                              nameof(value)
                              );
                }
                else
                {
                    i++;
                }

                codepointIndex++;
            }
        }
Пример #12
0
    public static unsafe void SalsaCore128(byte rounds, uint *state, byte *source, byte *destination)
    {
        var t8 = *(state + 8);
        var t9 = *(state + 9);

        var s1 = Avx.LoadVector256(state + 8);         // 8 9 10 11 12 13 14 15

        if (++*(state + 8) == 0)
        {
            ++*(state + 9);
        }

        // 4 9 14 3
        var x0 = Vector256.Create(
            *(state + 4), t9, *(state + 14), *(state + 3),
            *(state + 4), *(state + 9), *(state + 14), *(state + 3));
        // 0 5 10 15
        var x1 = Vector256.Create(
            *(state + 0), *(state + 5), *(state + 10), *(state + 15),
            *(state + 0), *(state + 5), *(state + 10), *(state + 15));
        // 12 1 6 11
        var x2 = Vector256.Create(
            *(state + 12), *(state + 1), *(state + 6), *(state + 11),
            *(state + 12), *(state + 1), *(state + 6), *(state + 11));
        // 8 13 2 7
        var x3 = Vector256.Create(
            t8, *(state + 13), *(state + 2), *(state + 7),
            *(state + 8), *(state + 13), *(state + 2), *(state + 7)
            );

        for (var i = 0; i < rounds; i += 2)
        {
            QuarterRound(ref x0, ref x1, ref x2, ref x3);
            Shuffle(ref x0, ref x2, ref x3);

            QuarterRound(ref x0, ref x1, ref x2, ref x3);
            Shuffle(ref x0, ref x2, ref x3);
        }

        Shuffle(ref x0, ref x1, ref x2, ref x3);

        var s0 = Avx.LoadVector256(state);         // 0 1 2 3 4 5 6 7

        x0 = Avx2.Add(x0, s0);
        x1 = Avx2.Add(x1, s1);
        x2 = Avx2.Add(x2, s0);
        x3 = Avx2.Add(x3, Avx.LoadVector256(state + 8));

        var v0 = Avx2.Xor(x0.AsByte(), Avx.LoadVector256(source));
        var v1 = Avx2.Xor(x1.AsByte(), Avx.LoadVector256(source + 32));
        var v2 = Avx2.Xor(x2.AsByte(), Avx.LoadVector256(source + 64));
        var v3 = Avx2.Xor(x3.AsByte(), Avx.LoadVector256(source + 96));

        Avx.Store(destination, v0);
        Avx.Store(destination + 32, v1);
        Avx.Store(destination + 64, v2);
        Avx.Store(destination + 96, v3);

        if (++*(state + 8) == 0)
        {
            ++*(state + 9);
        }
    }
        public unsafe int[,] IntegrateUnsafeVectorBranched()
        {
            int w = _data.Width();
            int h = _data.Height();

            int[,] res = new int[h, w];

            Vector256 <int> shiftRight = RotateRight;

            fixed(byte *pSource = &_data[0, 0])
            fixed(int *pTarget = &res[0, 0])
            {
                var pSrc = pSource;
                var pTrg = pTarget;

                for (var i = 0; i < h; i++)
                {
                    var j = 0;

                    var p  = Vector256.CreateScalar(0);
                    var pr = Vector256.CreateScalar(0);
                    //handle vector part
                    for (; j + Vector256 <int> .Count <= w; j += Vector256 <int> .Count)
                    {
                        var t = Avx2.ConvertToVector256Int32(pSrc); //(int)*(pSrc)

                        var s = Aggregate(p, t);                    // this code block has to be
                        p = t;                                      // added to handle the in-line
                        t = Avx2.Add(t, s);                         // recursion: S[i]=a[i]+S[i-1]

                        if (j > 0)
                        {
                            t = Avx2.Add(t, pr);    // t += *(pTrg - 1);
                        }
                        if (i > 0)
                        {
                            t = Avx2.Add(t, Avx.LoadVector256(pTrg - w));
                            if (j > 0)
                            {
                                t = Avx2.Subtract(t, Avx.LoadVector256(pTrg - w - 8));
                            }
                        }

                        Avx.Store(pTrg, t);
                        pr    = t;
                        pSrc += Vector256 <int> .Count;
                        pTrg += Vector256 <int> .Count;
                    }


                    // handle the tail
                    var pr2 = (j == 0 ? 0 : pr.GetElement(Vector256 <int> .Count - 1)); //  Vector256.CreateScalar(0);
                    for (; j < w; j++)
                    {
                        var t = (int)*(pSrc);           // Avx2.ConvertToVector256Int32(pSrc);
                        if (j > 0)
                        {
                            t += pr2;                   //    t = Avx2.Add(t, pr);
                        }
                        if (i > 0)
                        {
                            t += *(pTrg - w);           //  Avx2.Add(t, Avx.LoadVector256(pTrg - w));
                            if (j > 0)
                            {
                                t -= *(pTrg - w - 1);   //  Avx2.Subtract(t, Avx.LoadVector256(pTrg - w - 8));
                            }
                        }

                        *pTrg = t;                      // Avx2.Store(pTrg, t);
                        pr2 = t;                        // pr = t
                        pSrc++;
                        pTrg++;
                    }
                }
            }
            return(res);
        }
Пример #14
0
        public static bool find_structural_bits(uint8_t *buf, size_t len, ParsedJson *pj)
        {
            if (len > pj->bytecapacity)
            {
                Console.WriteLine("Your ParsedJson object only supports documents up to " + pj->bytecapacity +
                                  " bytes but you are trying to process " + len + " bytes\n");
                return(false);
            }

            uint32_t *base_ptr = pj->structural_indexes;
            uint32_t  @base    = 0;

            const uint64_t even_bits = 0x5555555555555555UL;
            const uint64_t odd_bits  = ~even_bits;

            // for now, just work in 64-byte chunks
            // we have padded the input out to 64 byte multiple with the remainder being
            // zeros

            // persistent state across loop
            uint64_t prev_iter_ends_odd_backslash = 0UL; // either 0 or 1, but a 64-bit value
            uint64_t prev_iter_inside_quote       = 0UL; // either all zeros or all ones

            // effectively the very first char is considered to follow "whitespace" for the
            // purposes of psuedo-structural character detection
            uint64_t prev_iter_ends_pseudo_pred = 1UL;
            size_t   lenminus64  = len < 64 ? 0 : len - 64;
            size_t   idx         = 0;
            uint64_t structurals = 0;

            // C#: assign static readonly fields to locals before the loop
            Vector256 <byte> low_nibble_mask  = s_low_nibble_mask;
            Vector256 <byte> high_nibble_mask = s_high_nibble_mask;

            var structural_shufti_mask = Vector256.Create((byte)0x7);
            var whitespace_shufti_mask = Vector256.Create((byte)0x18);
            var slashVec       = Vector256.Create((bytechar)'\\').AsByte();
            var ffVec          = Vector128.Create((byte)0xFF).AsUInt64();
            var doubleQuoteVec = Vector256.Create((byte)'"');
            var zeroBVec       = Vector256.Create((byte)0);
            var vec7f          = Vector256.Create((byte)0x7f);

            for (; idx < lenminus64; idx += 64)
            {
                var input_lo = Avx.LoadVector256(buf + idx + 0);
                var input_hi = Avx.LoadVector256(buf + idx + 32);
                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////
                ///
                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts     = start_edges & even_start_mask;
                uint64_t odd_starts      = start_edges & ~even_start_mask;
                uint64_t even_carries    = bs_bits + even_starts;
                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                bool iter_ends_odd_backslash =
                    add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1UL : 0x0UL;
                uint64_t even_carry_ends    = even_carries & ~bs_bits;
                uint64_t odd_carry_ends     = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends           = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = Sse2.X64.ConvertToUInt64(Pclmulqdq.CarrylessMultiply(
                                                                   Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0));

                uint32_t cnt       = (uint32_t)hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    @base += 8;
                }

                @base = next_base;

                quote_mask            ^= prev_iter_inside_quote;
                prev_iter_inside_quote =
                    (uint64_t)((int64_t)quote_mask >>
                               63);  // right shift of a signed value expected to be well-defined and standard compliant as of C++20, John Regher from Utah U. says this is fine code



                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                                 Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                                          vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                                 Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                                          vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0   = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1   = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before an
                // psuedo-structural character
                uint64_t pseudo_pred         = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);

                //Console.WriteLine($"Iter: {idx}, satur: {structurals}");

                //*(uint64_t *)(pj->structurals + idx / 8) = structurals;
            }

            ////////////////
            /// we use a giant copy-paste which is ugly.
            /// but otherwise the string needs to be properly padded or else we
            /// risk invalidating the UTF-8 checks.
            ////////////
            if (idx < len)
            {
                uint8_t *tmpbuf = stackalloc uint8_t[64];
                memset(tmpbuf, 0x20, 64);
                memcpy(tmpbuf, buf + idx, len - idx);
                Vector256 <byte> input_lo = Avx.LoadVector256(tmpbuf + 0);
                Vector256 <byte> input_hi = Avx.LoadVector256(tmpbuf + 32);
                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 1: detect odd sequences of backslashes
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t bs_bits =
                    cmp_mask_against_input(input_lo, input_hi, slashVec);
                uint64_t start_edges = bs_bits & ~(bs_bits << 1);
                // flip lowest if we have an odd-length run at the end of the prior
                // iteration
                uint64_t even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
                uint64_t even_starts     = start_edges & even_start_mask;
                uint64_t odd_starts      = start_edges & ~even_start_mask;
                uint64_t even_carries    = bs_bits + even_starts;

                uint64_t odd_carries;
                // must record the carry-out of our odd-carries out of bit 63; this
                // indicates whether the sense of any edge going to the next iteration
                // should be flipped
                //bool iter_ends_odd_backslash =
                add_overflow(bs_bits, odd_starts, &odd_carries);

                odd_carries |=
                    prev_iter_ends_odd_backslash; // push in bit zero as a potential end
                // if we had an odd-numbered run at the
                // end of the previous iteration
                //prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
                uint64_t even_carry_ends    = even_carries & ~bs_bits;
                uint64_t odd_carry_ends     = odd_carries & ~bs_bits;
                uint64_t even_start_odd_end = even_carry_ends & odd_bits;
                uint64_t odd_start_even_end = odd_carry_ends & even_bits;
                uint64_t odd_ends           = even_start_odd_end | odd_start_even_end;

                ////////////////////////////////////////////////////////////////////////////////////////////
                //     Step 2: detect insides of quote pairs
                ////////////////////////////////////////////////////////////////////////////////////////////

                uint64_t quote_bits =
                    cmp_mask_against_input(input_lo, input_hi, doubleQuoteVec);
                quote_bits = quote_bits & ~odd_ends;
                uint64_t quote_mask = (uint64_t)Sse2.X64.ConvertToInt64(Pclmulqdq.CarrylessMultiply(
                                                                            Vector128.Create(quote_bits, 0UL /*C# reversed*/), ffVec, 0).AsInt64());
                quote_mask ^= prev_iter_inside_quote;

                //BUG? https://github.com/dotnet/coreclr/issues/22813
                //quote_mask = 60;
                //prev_iter_inside_quote = (uint64_t)((int64_t)quote_mask >> 63); // right shift of a signed value expected to be well-defined and standard compliant as of C++20

                uint32_t cnt       = (uint32_t)hamming(structurals);
                uint32_t next_base = @base + cnt;
                while (structurals != 0)
                {
                    base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                    structurals         = structurals & (structurals - 1);
                    @base += 8;
                }
                @base = next_base;
                // How do we build up a user traversable data structure
                // first, do a 'shufti' to detect structural JSON characters
                // they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
                // these go into the first 3 buckets of the comparison (1/2/4)

                // we are also interested in the four whitespace characters
                // space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
                // these go into the next 2 buckets of the comparison (8/16)

                var v_lo = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_lo),
                    Avx2.Shuffle(high_nibble_mask,
                                 Avx2.And(Avx2.ShiftRightLogical(input_lo.AsUInt32(), 4).AsByte(),
                                          vec7f)));

                var v_hi = Avx2.And(
                    Avx2.Shuffle(low_nibble_mask, input_hi),
                    Avx2.Shuffle(high_nibble_mask,
                                 Avx2.And(Avx2.ShiftRightLogical(input_hi.AsUInt32(), 4).AsByte(),
                                          vec7f)));
                var tmp_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, structural_shufti_mask), zeroBVec);
                var tmp_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, structural_shufti_mask), zeroBVec);

                uint64_t structural_res_0 = (uint32_t)Avx2.MoveMask(tmp_lo);
                uint64_t structural_res_1 = (uint64_t)Avx2.MoveMask(tmp_hi);
                structurals = ~(structural_res_0 | (structural_res_1 << 32));

                // this additional mask and transfer is non-trivially expensive,
                // unfortunately
                var tmp_ws_lo = Avx2.CompareEqual(
                    Avx2.And(v_lo, whitespace_shufti_mask), zeroBVec);
                var tmp_ws_hi = Avx2.CompareEqual(
                    Avx2.And(v_hi, whitespace_shufti_mask), zeroBVec);

                uint64_t ws_res_0   = (uint32_t)Avx2.MoveMask(tmp_ws_lo);
                uint64_t ws_res_1   = (uint64_t)Avx2.MoveMask(tmp_ws_hi);
                uint64_t whitespace = ~(ws_res_0 | (ws_res_1 << 32));


                // mask off anything inside quotes
                structurals &= ~quote_mask;

                // add the real quote bits back into our bitmask as well, so we can
                // quickly traverse the strings we've spent all this trouble gathering
                structurals |= quote_bits;

                // Now, establish "pseudo-structural characters". These are non-whitespace
                // characters that are (a) outside quotes and (b) have a predecessor that's
                // either whitespace or a structural character. This means that subsequent
                // passes will get a chance to encounter the first character of every string
                // of non-whitespace and, if we're parsing an atom like true/false/null or a
                // number we can stop at the first whitespace or structural character
                // following it.

                // a qualified predecessor is something that can happen 1 position before an
                // psuedo-structural character
                uint64_t pseudo_pred         = structurals | whitespace;
                uint64_t shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
                prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
                uint64_t pseudo_structurals =
                    shifted_pseudo_pred & (~whitespace) & (~quote_mask);
                structurals |= pseudo_structurals;

                // now, we've used our close quotes all we need to. So let's switch them off
                // they will be off in the quote mask and on in quote bits.
                structurals &= ~(quote_bits & ~quote_mask);
                //*(uint64_t *)(pj->structurals + idx / 8) = structurals;
                idx += 64;
            }
            uint32_t cnt2       = (uint32_t)hamming(structurals);
            uint32_t next_base2 = @base + cnt2;

            while (structurals != 0)
            {
                base_ptr[@base + 0] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                base_ptr[@base + 1] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                base_ptr[@base + 2] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                base_ptr[@base + 3] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                base_ptr[@base + 4] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                base_ptr[@base + 5] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                base_ptr[@base + 6] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                base_ptr[@base + 7] = (uint32_t)idx - 64 + (uint32_t)trailingzeroes(structurals);
                structurals         = structurals & (structurals - 1);
                @base += 8;
            }
            @base = next_base2;

            pj->n_structural_indexes = @base;
            if (base_ptr[pj->n_structural_indexes - 1] > len)
            {
                throw new InvalidOperationException("Internal bug");
            }
            if (len != base_ptr[pj->n_structural_indexes - 1])
            {
                // the string might not be NULL terminated, but we add a virtual NULL ending character.
                base_ptr[pj->n_structural_indexes++] = (uint32_t)len;
            }
            base_ptr[pj->n_structural_indexes] = 0; // make it safe to dereference one beyond this array

            return(true);
        }
Пример #15
0
        static unsafe int Main(string[] args)
        {
            int testResult = Pass;

            if (Avx2.IsSupported)
            {
                Four    = 4;
                Eight   = 8;
                invalid = 15;

                for (int i = 0; i < N; i++)
                {
                    floatSourceTable[i]  = (float)i * 10.0f;
                    doubleSourceTable[i] = (double)i * 10.0;
                    intSourceTable[i]    = i * 10;
                    longSourceTable[i]   = i * 10;
                }

                Vector256 <int>  indexi;
                Vector256 <long> indexl;
                Vector128 <int>  indexi128;

                fixed(int *iptr = intIndexTable)
                fixed(long *lptr   = longIndexTable)
                fixed(int *i128ptr = vector128intIndexTable)
                {
                    indexi    = Avx.LoadVector256(iptr);
                    indexl    = Avx.LoadVector256(lptr);
                    indexi128 = Sse2.LoadVector128(i128ptr);
                }

                Vector256 <int>    maski;
                Vector256 <uint>   maskui;
                Vector256 <long>   maskl;
                Vector256 <ulong>  maskul;
                Vector256 <float>  maskf;
                Vector256 <double> maskd;

                fixed(int *iptr = intMaskTable)
                fixed(long *lptr = longMaskTable)
                {
                    maski = Avx.LoadVector256(iptr);
                    maskl = Avx.LoadVector256(lptr);

                    maskui = maski.AsUInt32();
                    maskul = maskl.AsUInt64();
                    maskf  = maski.AsSingle();
                    maskd  = maskl.AsDouble();
                }

                Vector256 <int>    sourcei  = Vector256 <int> .Zero;
                Vector256 <uint>   sourceui = Vector256 <uint> .Zero;
                Vector256 <long>   sourcel  = Vector256 <long> .Zero;
                Vector256 <ulong>  sourceul = Vector256 <ulong> .Zero;
                Vector256 <float>  sourcef  = Vector256 <float> .Zero;
                Vector256 <double> sourced  = Vector256 <double> .Zero;

                // public static unsafe Vector256<float> GatherMaskVector256(Vector256<float> source, float* baseAddress, Vector256<int> index, Vector256<float> mask, byte scale)
                using (TestTable <float, int> floatTable = new TestTable <float, int>(floatSourceTable, new float[8]))
                {
                    var vf = Avx2.GatherMaskVector256(sourcef, (float *)(floatTable.inArrayPtr), indexi, maskf, 4);
                    Unsafe.Write(floatTable.outArrayPtr, vf);

                    if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on float:");
                        foreach (var item in floatTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vf = (Vector256 <float>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <float>), typeof(float *), typeof(Vector256 <int>), typeof(Vector256 <float>), typeof(byte) }).
                         Invoke(null, new object[] { sourcef, Pointer.Box(floatTable.inArrayPtr, typeof(float *)), indexi, maskf, (byte)4 });
                    Unsafe.Write(floatTable.outArrayPtr, vf);

                    if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on float:");
                        foreach (var item in floatTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcef, (float *)(floatTable.inArrayPtr), indexi, maskf, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on float with invalid scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vf = Avx2.GatherMaskVector256(sourcef, (float *)(floatTable.inArrayPtr), indexi, maskf, Four);
                    Unsafe.Write(floatTable.outArrayPtr, vf);

                    if (!floatTable.CheckResult((x, y) => BitConverter.SingleToInt32Bits(x) == BitConverter.SingleToInt32Bits(y), intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on float with non-const scale (IMM):");
                        foreach (var item in floatTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcef, (float *)(floatTable.inArrayPtr), indexi, maskf, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on float with invalid non-const scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<double> GatherMaskVector256(Vector256<double> source, double* baseAddress, Vector128<int> index, Vector256<double> mask, byte scale)
                using (TestTable <double, int> doubletTable = new TestTable <double, int>(doubleSourceTable, new double[4]))
                {
                    var vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexi128, maskd, 8);
                    Unsafe.Write(doubletTable.outArrayPtr, vd);

                    if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double:");
                        foreach (var item in doubletTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vd = (Vector256 <double>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <double>), typeof(double *), typeof(Vector128 <int>), typeof(Vector256 <double>), typeof(byte) }).
                         Invoke(null, new object[] { sourced, Pointer.Box(doubletTable.inArrayPtr, typeof(double *)), indexi128, maskd, (byte)8 });
                    Unsafe.Write(doubletTable.outArrayPtr, vd);

                    if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on double:");
                        foreach (var item in doubletTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexi128, maskd, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexi128, maskd, Eight);
                    Unsafe.Write(doubletTable.outArrayPtr, vd);

                    if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double with non-const scale (IMM):");
                        foreach (var item in doubletTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexi128, maskd, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid non-const scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<int> GatherMaskVector256(Vector256<int> source, int* baseAddress, Vector256<int> index, Vector256<int> mask, byte scale)
                using (TestTable <int, int> intTable = new TestTable <int, int>(intSourceTable, new int[8]))
                {
                    var vf = Avx2.GatherMaskVector256(sourcei, (int *)(intTable.inArrayPtr), indexi, maski, 4);
                    Unsafe.Write(intTable.outArrayPtr, vf);

                    if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on int:");
                        foreach (var item in intTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vf = (Vector256 <int>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <int>), typeof(int *), typeof(Vector256 <int>), typeof(Vector256 <int>), typeof(byte) }).
                         Invoke(null, new object[] { sourcei, Pointer.Box(intTable.inArrayPtr, typeof(int *)), indexi, maski, (byte)4 });
                    Unsafe.Write(intTable.outArrayPtr, vf);

                    if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on int:");
                        foreach (var item in intTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcei, (int *)(intTable.inArrayPtr), indexi, maski, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on int with invalid scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vf = Avx2.GatherMaskVector256(sourcei, (int *)(intTable.inArrayPtr), indexi, maski, Four);
                    Unsafe.Write(intTable.outArrayPtr, vf);

                    if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on int with non-const scale (IMM):");
                        foreach (var item in intTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcei, (int *)(intTable.inArrayPtr), indexi, maski, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on int with invalid non-const scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<uint> GatherMaskVector256(Vector256<uint> source, uint* baseAddress, Vector256<int> index, Vector256<uint> mask, byte scale)
                using (TestTable <int, int> intTable = new TestTable <int, int>(intSourceTable, new int[8]))
                {
                    var vf = Avx2.GatherMaskVector256(sourceui, (uint *)(intTable.inArrayPtr), indexi, maskui, 4);
                    Unsafe.Write(intTable.outArrayPtr, vf);

                    if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on uint:");
                        foreach (var item in intTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vf = (Vector256 <uint>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <uint>), typeof(uint *), typeof(Vector256 <int>), typeof(Vector256 <uint>), typeof(byte) }).
                         Invoke(null, new object[] { sourceui, Pointer.Box(intTable.inArrayPtr, typeof(uint *)), indexi, maskui, (byte)4 });
                    if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on uint:");
                        foreach (var item in intTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourceui, (uint *)(intTable.inArrayPtr), indexi, maskui, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with invalid scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vf = Avx2.GatherMaskVector256(sourceui, (uint *)(intTable.inArrayPtr), indexi, maskui, Four);
                    Unsafe.Write(intTable.outArrayPtr, vf);

                    if (!intTable.CheckResult((x, y) => x == y, intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with non-const scale (IMM):");
                        foreach (var item in intTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourceui, (uint *)(intTable.inArrayPtr), indexi, maskui, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on uint with invalid non-const scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<long> GatherMaskVector256(Vector256<long> source, long* baseAddress, Vector128<int> index, Vector256<long> mask, byte scale)
                using (TestTable <long, int> longTable = new TestTable <long, int>(longSourceTable, new long[4]))
                {
                    var vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexi128, maskl, 8);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vf = (Vector256 <long>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <long>), typeof(long *), typeof(Vector128 <int>), typeof(Vector256 <long>), typeof(byte) }).
                         Invoke(null, new object[] { sourcel, Pointer.Box(longTable.inArrayPtr, typeof(long *)), indexi128, maskl, (byte)8 });
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on long:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexi128, maskl, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexi128, maskl, Eight);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with non-const scale (IMM):");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexi128, maskl, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid non-const scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<ulong> GatherMaskVector256(Vector256<ulong> source, ulong* baseAddress, Vector128<int> index, Vector256<ulong> mask, byte scale)
                using (TestTable <long, int> longTable = new TestTable <long, int>(longSourceTable, new long[4]))
                {
                    var vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexi128, maskul, 8);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vf = (Vector256 <ulong>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <ulong>), typeof(ulong *), typeof(Vector128 <int>), typeof(Vector256 <ulong>), typeof(byte) }).
                         Invoke(null, new object[] { sourceul, Pointer.Box(longTable.inArrayPtr, typeof(ulong *)), indexi128, maskul, (byte)8 });
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on ulong:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexi128, maskul, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexi128, maskul, Eight);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, vector128intIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with non-const scale (IMM):");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexi128, maskul, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid non-const scale (IMM)");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<long> GatherMaskVector256(Vector256<long> source, long* baseAddress, Vector256<long> index, Vector256<long> mask, byte scale)
                using (TestTable <long, long> longTable = new TestTable <long, long>(longSourceTable, new long[4]))
                {
                    var vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexl, maskl, 8);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with Vector256 long index:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vf = (Vector256 <long>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <long>), typeof(long *), typeof(Vector256 <long>), typeof(Vector256 <long>), typeof(byte) }).
                         Invoke(null, new object[] { sourcel, Pointer.Box(longTable.inArrayPtr, typeof(long *)), indexl, maskl, (byte)8 });
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on long with Vector256 long index:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexl, maskl, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid scale (IMM) and Vector256 long index");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexl, maskl, Eight);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with non-const scale (IMM) and Vector256 long index:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourcel, (long *)(longTable.inArrayPtr), indexl, maskl, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid non-const scale (IMM) and Vector256 long index");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<ulong> GatherMaskVector256(Vector256<ulong> source, ulong* baseAddress, Vector256<long> index, Vector256<ulong> mask, byte scale)
                using (TestTable <long, long> longTable = new TestTable <long, long>(longSourceTable, new long[4]))
                {
                    var vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexl, maskul, 8);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with Vector256 long index:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vf = (Vector256 <ulong>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <ulong>), typeof(ulong *), typeof(Vector256 <long>), typeof(Vector256 <ulong>), typeof(byte) }).
                         Invoke(null, new object[] { sourceul, Pointer.Box(longTable.inArrayPtr, typeof(ulong *)), indexl, maskul, (byte)8 });
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on ulong with Vector256 long index:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexl, maskul, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with invalid scale (IMM) and Vector256 long index");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexl, maskul, Eight);
                    Unsafe.Write(longTable.outArrayPtr, vf);

                    if (!longTable.CheckResult((x, y) => x == y, longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on ulong with non-const scale (IMM) and Vector256 long index:");
                        foreach (var item in longTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vf = Avx2.GatherMaskVector256(sourceul, (ulong *)(longTable.inArrayPtr), indexl, maskul, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on long with invalid non-const scale (IMM) and Vector256 long index");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }

                // public static unsafe Vector256<double> GatherMaskVector256(Vector256<double> source, double* baseAddress, Vector256<long> index, Vector256<double> mask, byte scale)
                using (TestTable <double, long> doubletTable = new TestTable <double, long>(doubleSourceTable, new double[4]))
                {
                    var vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexl, maskd, 8);
                    Unsafe.Write(doubletTable.outArrayPtr, vd);

                    if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double with Vector256 long index:");
                        foreach (var item in doubletTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    vd = (Vector256 <double>) typeof(Avx2).GetMethod(nameof(Avx2.GatherMaskVector256), new Type[] { typeof(Vector256 <double>), typeof(double *), typeof(Vector256 <long>), typeof(Vector256 <double>), typeof(byte) }).
                         Invoke(null, new object[] { sourced, Pointer.Box(doubletTable.inArrayPtr, typeof(double *)), indexl, maskd, (byte)8 });
                    Unsafe.Write(doubletTable.outArrayPtr, vd);

                    if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed with reflection on double with Vector256 long index:");
                        foreach (var item in doubletTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexl, maskd, 3);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid scale (IMM) and Vector256 long index");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }

                    vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexl, maskd, Eight);
                    Unsafe.Write(doubletTable.outArrayPtr, vd);

                    if (!doubletTable.CheckResult((x, y) => BitConverter.DoubleToInt64Bits(x) == BitConverter.DoubleToInt64Bits(y), longIndexTable))
                    {
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double with non-const scale (IMM) and Vector256 long index:");
                        foreach (var item in doubletTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    try
                    {
                        vd = Avx2.GatherMaskVector256(sourced, (double *)(doubletTable.inArrayPtr), indexl, maskd, invalid);
                        Console.WriteLine("AVX2 GatherMaskVector256 failed on double with invalid non-const scale (IMM) and Vector256 long index");
                        testResult = Fail;
                    }
                    catch (System.ArgumentOutOfRangeException)
                    {
                        // success
                    }
                }
            }



            return(testResult);
        }
Пример #16
0
        // This function implements Algorithm 2 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf
        // Calculate the stochastic gradient and update the model.
        public static unsafe void CalculateGradientAndUpdate(int *fieldIndices, int *featureIndices, float *featureValues, float *latentSum, float *linearWeights,
                                                             float *latentWeights, float *linearAccumulatedSquaredGrads, float *latentAccumulatedSquaredGrads, float lambdaLinear, float lambdaLatent, float learningRate,
                                                             int fieldCount, int latentDim, float weight, int count, float slope)
        {
            Contracts.Assert(Avx.IsSupported);

            int    m   = fieldCount;
            int    d   = latentDim;
            int    c   = count;
            int *  pf  = fieldIndices;
            int *  pi  = featureIndices;
            float *px  = featureValues;
            float *pq  = latentSum;
            float *pw  = linearWeights;
            float *pv  = latentWeights;
            float *phw = linearAccumulatedSquaredGrads;
            float *phv = latentAccumulatedSquaredGrads;

            Vector256 <float> wei     = Vector256.Create(weight);
            Vector256 <float> s       = Vector256.Create(slope);
            Vector256 <float> lr      = Vector256.Create(learningRate);
            Vector256 <float> lambdav = Vector256.Create(lambdaLatent);

            for (int i = 0; i < count; i++)
            {
                int f = pf[i];
                int j = pi[i];

                // Calculate gradient of linear term w_j.
                float g = weight * (lambdaLinear * pw[j] + slope * px[i]);

                // Accumulate the gradient of the linear term.
                phw[j] += g * g;

                // Perform ADAGRAD update rule to adjust linear term.
                pw[j] -= learningRate / MathF.Sqrt(phw[j]) * g;

                // Update latent term, v_j,f', f'=1,...,m.
                Vector256 <float> x = Avx.BroadcastScalarToVector256(px + i);

                for (int fprime = 0; fprime < m; fprime++)
                {
                    float *           vjfprime  = pv + j * m * d + fprime * d;
                    float *           hvjfprime = phv + j * m * d + fprime * d;
                    float *           qfprimef  = pq + fprime * m * d + f * d;
                    Vector256 <float> sx        = Avx.Multiply(s, x);

                    for (int k = 0; k + 8 <= d; k += 8)
                    {
                        Vector256 <float> v = Avx.LoadVector256(vjfprime + k);
                        Vector256 <float> q = Avx.LoadVector256(qfprimef + k);

                        // Calculate L2-norm regularization's gradient.
                        Vector256 <float> gLatent = Avx.Multiply(lambdav, v);

                        Vector256 <float> tmp = q;

                        // Calculate loss function's gradient.
                        if (fprime == f)
                        {
                            tmp = MultiplyAddNegated(v, x, q);
                        }
                        gLatent = MultiplyAdd(sx, tmp, gLatent);
                        gLatent = Avx.Multiply(wei, gLatent);

                        // Accumulate the gradient of latent vectors.
                        Vector256 <float> h = MultiplyAdd(gLatent, gLatent, Avx.LoadVector256(hvjfprime + k));

                        // Perform ADAGRAD update rule to adjust latent vector.
                        v = MultiplyAddNegated(lr, Avx.Multiply(Avx.ReciprocalSqrt(h), gLatent), v);
                        Avx.Store(vjfprime + k, v);
                        Avx.Store(hvjfprime + k, h);
                    }
                }
            }
        }
Пример #17
0
        // This function implements Algorithm 1 in https://github.com/wschin/fast-ffm/blob/master/fast-ffm.pdf.
        // Compute the output value of the field-aware factorization, as the sum of the linear part and the latent part.
        // The linear part is the inner product of linearWeights and featureValues.
        // The latent part is the sum of all intra-field interactions in one field f, for all fields possible
        public static unsafe void CalculateIntermediateVariables(int *fieldIndices, int *featureIndices, float *featureValues,
                                                                 float *linearWeights, float *latentWeights, float *latentSum, float *response, int fieldCount, int latentDim, int count)
        {
            Contracts.Assert(Avx.IsSupported);

            // The number of all possible fields.
            int    m              = fieldCount;
            int    d              = latentDim;
            int    c              = count;
            int *  pf             = fieldIndices;
            int *  pi             = featureIndices;
            float *px             = featureValues;
            float *pw             = linearWeights;
            float *pv             = latentWeights;
            float *pq             = latentSum;
            float  linearResponse = 0;
            float  latentResponse = 0;

            Unsafe.InitBlock(pq, 0, (uint)(m * m * d * sizeof(float)));

            Vector256 <float> y   = Vector256 <float> .Zero;
            Vector256 <float> tmp = Vector256 <float> .Zero;

            for (int i = 0; i < c; i++)
            {
                int f = pf[i];
                int j = pi[i];
                linearResponse += pw[j] * px[i];

                Vector256 <float> x  = Avx.BroadcastScalarToVector256(px + i);
                Vector256 <float> xx = Avx.Multiply(x, x);

                // tmp -= <v_j,f, v_j,f> * x * x
                int vBias = j * m * d + f * d;

                // j-th feature's latent vector in the f-th field hidden space.
                float *vjf = pv + vBias;

                for (int k = 0; k + 8 <= d; k += 8)
                {
                    Vector256 <float> vjfBuffer = Avx.LoadVector256(vjf + k);
                    tmp = MultiplyAddNegated(Avx.Multiply(vjfBuffer, vjfBuffer), xx, tmp);
                }

                for (int fprime = 0; fprime < m; fprime++)
                {
                    vBias = j * m * d + fprime * d;
                    int    qBias    = f * m * d + fprime * d;
                    float *vjfprime = pv + vBias;
                    float *qffprime = pq + qBias;

                    // q_f,f' += v_j,f' * x
                    for (int k = 0; k + 8 <= d; k += 8)
                    {
                        Vector256 <float> vjfprimeBuffer = Avx.LoadVector256(vjfprime + k);
                        Vector256 <float> q = Avx.LoadVector256(qffprime + k);
                        q = MultiplyAdd(vjfprimeBuffer, x, q);
                        Avx.Store(qffprime + k, q);
                    }
                }
            }

            for (int f = 0; f < m; f++)
            {
                // tmp += <q_f,f, q_f,f>
                float *qff = pq + f * m * d + f * d;
                for (int k = 0; k + 8 <= d; k += 8)
                {
                    Vector256 <float> qffBuffer = Avx.LoadVector256(qff + k);

                    // Intra-field interactions.
                    tmp = MultiplyAdd(qffBuffer, qffBuffer, tmp);
                }

                // y += <q_f,f', q_f',f>, f != f'
                // Whis loop handles inter - field interactions because f != f'.
                for (int fprime = f + 1; fprime < m; fprime++)
                {
                    float *qffprime = pq + f * m * d + fprime * d;
                    float *qfprimef = pq + fprime * m * d + f * d;
                    for (int k = 0; k + 8 <= d; k += 8)
                    {
                        // Inter-field interaction.
                        Vector256 <float> qffprimeBuffer = Avx.LoadVector256(qffprime + k);
                        Vector256 <float> qfprimefBuffer = Avx.LoadVector256(qfprimef + k);
                        y = MultiplyAdd(qffprimeBuffer, qfprimefBuffer, y);
                    }
                }
            }

            y   = MultiplyAdd(_point5, tmp, y);
            tmp = Avx.Add(y, Avx.Permute2x128(y, y, 1));
            tmp = Avx.HorizontalAdd(tmp, tmp);
            y   = Avx.HorizontalAdd(tmp, tmp);
            Sse.StoreScalar(&latentResponse, y.GetLower()); // The lowest slot is the response value.
            *response = linearResponse + latentResponse;
        }
Пример #18
0
        public unsafe void Vector256FloatMultipleOpsUnsafe()
        {
            fixed(float *d1Ptr = &data[0])
            {
                fixed(float *d2Ptr = &data2[0])
                {
                    fixed(float *d3Ptr = &data3[0])
                    {
                        fixed(float *resPtr = &result[0])
                        {
                            float *currD1   = d1Ptr;
                            float *currD2   = d2Ptr;
                            float *currD3   = d3Ptr;
                            float *currRes  = resPtr;
                            float *limitPtr = d1Ptr + numberOfFloatItems;

                            while (currD1 < limitPtr)
                            {
                                Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currD1), Avx.LoadVector256(currD2), Avx.LoadVector256(currD3)));
                                Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currRes), Avx.LoadVector256(currD1), Avx.LoadVector256(currD1)));
                                Avx.Store(currRes, Fma.MultiplyAdd(Avx.LoadVector256(currD1), Avx.LoadVector256(currD2), Avx.LoadVector256(currRes)));
                                currD1  += 8;
                                currD2  += 8;
                                currD3  += 8;
                                currRes += 8;
                            }
                        }
                    }
                }
            }
        }
Пример #19
0
        public static unsafe bool TryGetAsciiString(byte *input, char *output, int count)
        {
            Debug.Assert(input != null);
            Debug.Assert(output != null);

            var end = input + count;

            Debug.Assert((long)end >= Vector256 <sbyte> .Count);

            if (Sse2.IsSupported)
            {
                if (Avx2.IsSupported && input <= end - Vector256 <sbyte> .Count)
                {
                    Vector256 <sbyte> zero = Vector256 <sbyte> .Zero;

                    do
                    {
                        var vector = Avx.LoadVector256(input).AsSByte();
                        if (!CheckBytesInAsciiRange(vector, zero))
                        {
                            return(false);
                        }

                        var tmp0 = Avx2.UnpackLow(vector, zero);
                        var tmp1 = Avx2.UnpackHigh(vector, zero);

                        // Bring into the right order
                        var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
                        var out1 = Avx2.Permute2x128(tmp0, tmp1, 0x31);

                        Avx.Store((ushort *)output, out0.AsUInt16());
                        Avx.Store((ushort *)output + Vector256 <ushort> .Count, out1.AsUInt16());

                        input  += Vector256 <sbyte> .Count;
                        output += Vector256 <sbyte> .Count;
                    } while (input <= end - Vector256 <sbyte> .Count);

                    if (input == end)
                    {
                        return(true);
                    }
                }

                if (input <= end - Vector128 <sbyte> .Count)
                {
                    Vector128 <sbyte> zero = Vector128 <sbyte> .Zero;

                    do
                    {
                        var vector = Sse2.LoadVector128(input).AsSByte();
                        if (!CheckBytesInAsciiRange(vector, zero))
                        {
                            return(false);
                        }

                        var c0 = Sse2.UnpackLow(vector, zero).AsUInt16();
                        var c1 = Sse2.UnpackHigh(vector, zero).AsUInt16();

                        Sse2.Store((ushort *)output, c0);
                        Sse2.Store((ushort *)output + Vector128 <ushort> .Count, c1);

                        input  += Vector128 <sbyte> .Count;
                        output += Vector128 <sbyte> .Count;
                    } while (input <= end - Vector128 <sbyte> .Count);

                    if (input == end)
                    {
                        return(true);
                    }
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                while (input <= end - Vector <sbyte> .Count)
                {
                    var vector = Unsafe.AsRef <Vector <sbyte> >(input);
                    if (!CheckBytesInAsciiRange(vector))
                    {
                        return(false);
                    }

                    Vector.Widen(
                        vector,
                        out Unsafe.AsRef <Vector <short> >(output),
                        out Unsafe.AsRef <Vector <short> >(output + Vector <short> .Count));

                    input  += Vector <sbyte> .Count;
                    output += Vector <sbyte> .Count;
                }

                if (input == end)
                {
                    return(true);
                }
            }

            if (Environment.Is64BitProcess) // Use Intrinsic switch for branch elimination
            {
                // 64-bit: Loop longs by default
                while (input <= end - sizeof(long))
                {
                    var value = *(long *)input;
                    if (!CheckBytesInAsciiRange(value))
                    {
                        return(false);
                    }

                    if (Bmi2.X64.IsSupported)
                    {
                        // BMI2 will work regardless of the processor's endianness.
                        ((ulong *)output)[0] = Bmi2.X64.ParallelBitDeposit((ulong)value, 0x00FF00FF_00FF00FFul);
                        ((ulong *)output)[1] = Bmi2.X64.ParallelBitDeposit((ulong)(value >> 32), 0x00FF00FF_00FF00FFul);
                    }
                    else
                    {
                        output[0] = (char)input[0];
                        output[1] = (char)input[1];
                        output[2] = (char)input[2];
                        output[3] = (char)input[3];
                        output[4] = (char)input[4];
                        output[5] = (char)input[5];
                        output[6] = (char)input[6];
                        output[7] = (char)input[7];
                    }

                    input  += sizeof(long);
                    output += sizeof(long);
                }

                if (input <= end - sizeof(int))
                {
                    var value = *(int *)input;
                    if (!CheckBytesInAsciiRange(value))
                    {
                        return(false);
                    }

                    if (Bmi2.IsSupported)
                    {
                        // BMI2 will work regardless of the processor's endianness.
                        ((uint *)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
                        ((uint *)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
                    }
                    else
                    {
                        output[0] = (char)input[0];
                        output[1] = (char)input[1];
                        output[2] = (char)input[2];
                        output[3] = (char)input[3];
                    }

                    input  += sizeof(int);
                    output += sizeof(int);
                }
            }
            else
            {
                // 32-bit: Loop ints by default
                while (input <= end - sizeof(int))
                {
                    var value = *(int *)input;
                    if (!CheckBytesInAsciiRange(value))
                    {
                        return(false);
                    }

                    if (Bmi2.IsSupported)
                    {
                        // BMI2 will work regardless of the processor's endianness.
                        ((uint *)output)[0] = Bmi2.ParallelBitDeposit((uint)value, 0x00FF00FFu);
                        ((uint *)output)[1] = Bmi2.ParallelBitDeposit((uint)(value >> 16), 0x00FF00FFu);
                    }
                    else
                    {
                        output[0] = (char)input[0];
                        output[1] = (char)input[1];
                        output[2] = (char)input[2];
                        output[3] = (char)input[3];
                    }

                    input  += sizeof(int);
                    output += sizeof(int);
                }
            }

            if (input <= end - sizeof(short))
            {
                if (!CheckBytesInAsciiRange(((short *)input)[0]))
                {
                    return(false);
                }

                output[0] = (char)input[0];
                output[1] = (char)input[1];

                input  += sizeof(short);
                output += sizeof(short);
            }

            if (input < end)
            {
                if (!CheckBytesInAsciiRange(((sbyte *)input)[0]))
                {
                    return(false);
                }
                output[0] = (char)input[0];
            }

            return(true);
        }
Пример #20
0
        unsafe private static void mixAvx2(Blake2bContext *s, ulong *m)
        {
            var row1 = Avx.LoadVector256(s->h);
            var row2 = Avx.LoadVector256(s->h + 4);

            var row3 = v256iv0;
            var row4 = v256iv1;

            row4 = Avx2.Xor(row4, Avx.LoadVector256(s->t));             // reads into f[] as well

            //ROUND 1
            var m0 = Avx2.BroadcastVector128ToVector256(m);
            var m1 = Avx2.BroadcastVector128ToVector256(m + 2);
            var m2 = Avx2.BroadcastVector128ToVector256(m + 4);
            var m3 = Avx2.BroadcastVector128ToVector256(m + 6);

            var r24 = v256rm0;
            var r16 = v256rm1;

            var t0 = Avx2.UnpackLow(m0, m1);
            var t1 = Avx2.UnpackLow(m2, m3);
            var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            var m4 = Avx2.BroadcastVector128ToVector256(m + 8);
            var m5 = Avx2.BroadcastVector128ToVector256(m + 10);
            var m6 = Avx2.BroadcastVector128ToVector256(m + 12);
            var m7 = Avx2.BroadcastVector128ToVector256(m + 14);

            t0 = Avx2.UnpackLow(m4, m5);
            t1 = Avx2.UnpackLow(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m5);
            t1 = Avx2.UnpackHigh(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 2
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            t1 = Avx2.UnpackHigh(m5, m2);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m1);
            t1 = Avx2.UnpackHigh(m3, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 3
            t0 = Avx2.AlignRight(m6, m5, 8);
            t1 = Avx2.UnpackHigh(m2, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m4, m0);
            t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m5.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackHigh(m3, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m7, m3);
            t1 = Avx2.AlignRight(m2, m0, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 4
            t0 = Avx2.UnpackHigh(m3, m1);
            t1 = Avx2.UnpackHigh(m6, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m0);
            t1 = Avx2.UnpackLow(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m3, m5);
            t1 = Avx2.UnpackLow(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 5
            t0 = Avx2.UnpackHigh(m4, m2);
            t1 = Avx2.UnpackLow(m1, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m7.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.Blend(m3.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m6, m0, 8);
            t1 = Avx2.Blend(m4.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 6
            t0 = Avx2.UnpackLow(m1, m3);
            t1 = Avx2.UnpackLow(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m5);
            t1 = Avx2.UnpackHigh(m5, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Blend(m2.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackHigh(m7, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m6, m2);
            t1 = Avx2.Blend(m7.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 7
            t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackLow(m7, m2);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m2, m7);
            t1 = Avx2.AlignRight(m5, m6, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackLow(m0, m3);
            t1 = Avx2.Shuffle(m4.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m3, m1);
            t1 = Avx2.Blend(m1.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 8
            t0 = Avx2.UnpackHigh(m6, m3);
            t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m7, m5, 8);
            t1 = Avx2.UnpackHigh(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackHigh(m2, m7);
            t1 = Avx2.UnpackLow(m4, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m0, m2);
            t1 = Avx2.UnpackLow(m3, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 9
            t0 = Avx2.UnpackLow(m3, m7);
            t1 = Avx2.AlignRight(m0, m5, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.AlignRight(m4, m1, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = m6;
            t1 = Avx2.AlignRight(m5, m0, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = m2;
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 10
            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.UnpackHigh(m3, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m1, m2);
            t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.UnpackHigh(m1, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m7, m5, 8);
            t1 = Avx2.UnpackLow(m6, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 11
            t0 = Avx2.UnpackLow(m0, m1);
            t1 = Avx2.UnpackLow(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.UnpackLow(m4, m5);
            t1 = Avx2.UnpackLow(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m5);
            t1 = Avx2.UnpackHigh(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            //ROUND 12
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_10_01_00_11);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_00_11_10_01);

            t0 = Avx2.Shuffle(m0.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            t1 = Avx2.UnpackHigh(m5, m2);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsSByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m1);
            t1 = Avx2.UnpackHigh(m3, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsSByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row4 = Avx2.Permute4x64(row4, 0b_00_11_10_01);
            row3 = Avx2.Permute4x64(row3, 0b_01_00_11_10);
            row2 = Avx2.Permute4x64(row2, 0b_10_01_00_11);

            row1 = Avx2.Xor(row1, row3);
            row2 = Avx2.Xor(row2, row4);
            row1 = Avx2.Xor(row1, Avx2.LoadVector256(s->h));
            row2 = Avx2.Xor(row2, Avx2.LoadVector256(s->h + 4));

            Avx2.Store(s->h, row1);
            Avx2.Store(s->h + 4, row2);
        }
Пример #21
0
        public void Serialize(ref MessagePackWriter writer, bool[]?value, MessagePackSerializerOptions options)
        {
            if (value == null)
            {
                writer.WriteNil();
                return;
            }

            var inputLength = value.Length;

            writer.WriteArrayHeader(inputLength);
            if (inputLength == 0)
            {
                return;
            }

            var outputLength = inputLength;

            fixed(bool *pSource = &value[0])
            {
                var inputEnd      = pSource + inputLength;
                var inputIterator = pSource;
                var destination   = writer.GetSpan(inputLength);

                fixed(byte *pDestination = &destination[0])
                {
                    var outputIterator = pDestination;

                    if (Avx2.IsSupported)
                    {
                        const int ShiftCount = 5;
                        const int Stride     = 1 << ShiftCount;
                        if (inputLength < Stride << 1)
                        {
                            goto ProcessEach;
                        }

                        {
                            // make output span align 32
                            var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign32(outputIterator);
                            inputLength -= offset;
                            var offsetEnd = inputIterator + offset;
                            while (inputIterator != offsetEnd)
                            {
                                *outputIterator++ = *inputIterator++ ? MessagePackCode.True : MessagePackCode.False;
                            }
                        }

                        var vectorTrue       = Vector256.Create(MessagePackCode.True).AsSByte();
                        var vectorLoopLength = (inputLength >> ShiftCount) << ShiftCount;
                        for (var vectorizedEnd = inputIterator + vectorLoopLength; inputIterator != vectorizedEnd; inputIterator += Stride, outputIterator += Stride)
                        {
                            // Load 32 bool values.
                            var current = Avx.LoadVector256((sbyte *)inputIterator);

                            // A value of false for the type bool is 0 for the sbyte representation.
                            var isTrue = Avx2.CompareEqual(current, Vector256 <sbyte> .Zero);
                            // A value of true in the SIMD context is -1 for the sbyte representation.
                            // True is 0xc3 as MessagePackCode and false is 0xc2.
                            // Reinterpreted as sbyte values, they are -61 and -62, respectively.
                            // For each of the 32 true Vectors, we can add -1 to the false ones to get the answer.
                            var answer = Avx2.Add(vectorTrue, isTrue);
                            Avx.Store((sbyte *)outputIterator, answer);
                        }
                    }
                    else if (Sse2.IsSupported)
                    {
                        // for older x86 cpu
                        const int ShiftCount = 4;
                        const int Stride     = 1 << ShiftCount;
                        if (inputLength < Stride << 1)
                        {
                            goto ProcessEach;
                        }

                        {
                            // make output span align 16
                            var offset = UnsafeMemoryAlignmentUtility.CalculateDifferenceAlign16(outputIterator);
                            inputLength -= offset;
                            var offsetEnd = inputIterator + offset;
                            while (inputIterator != offsetEnd)
                            {
                                *outputIterator++ = *inputIterator++ ? MessagePackCode.True : MessagePackCode.False;
                            }
                        }

                        var vectorTrue       = Vector128.Create(MessagePackCode.True).AsSByte();
                        var vectorLoopLength = (inputLength >> ShiftCount) << ShiftCount;
                        for (var vectorizedEnd = inputIterator + vectorLoopLength; inputIterator != vectorizedEnd; inputIterator += Stride, outputIterator += Stride)
                        {
                            // Load 16 bool values.
                            var current = Sse2.LoadVector128((sbyte *)inputIterator);

                            // A value of false for the type bool is 0 for the sbyte representation.
                            var isTrue = Sse2.CompareEqual(current, Vector128 <sbyte> .Zero);
                            // A value of true in the SIMD context is -1 for the sbyte representation.
                            // True is 0xc3 as MessagePackCode and false is 0xc2.
                            // Reinterpreted as sbyte values, they are -61 and -62, respectively.
                            // For each of the 16 true Vectors, we can add -1 to the false ones to get the answer.
                            var answer = Sse2.Add(vectorTrue, isTrue);
                            Sse2.Store((sbyte *)outputIterator, answer);
                        }
                    }

ProcessEach:
                    while (inputIterator != inputEnd)
                    {
                        *outputIterator++ = *inputIterator++ ? MessagePackCode.True : MessagePackCode.False;
                    }
                }

                writer.Advance(outputLength);
            }
        }
Пример #22
0
        public static float DotMultiplyIntrinsicWFmaWSpanPtr(ref Memory <float> vector1, ref Memory <float> vector2)
        {
            var span1   = vector1.Span;
            var span2   = vector2.Span;
            var cnt     = Math.Min(span1.Length, span2.Length);
            var v3      = Vector256.CreateScalarUnsafe(0f);
            var vectLen = Vector256 <float> .Count;
            var vectCnt = cnt / vectLen;
            var total   = 0f;

#if TEST
            var file = Path.GetTempFileName();
            using var writer = new StreamWriter(file);
            Console.WriteLine($"Intrinsic with FmaWPtr Mult. results will be written into {file}");
#endif

            unsafe
            {
                int i;
                var ptr1 = (float *)Unsafe.AsPointer(ref span1[0]);
                var ptr2 = (float *)Unsafe.AsPointer(ref span2[0]);

                for (i = 0; i < vectCnt; i++)
                {
                    var v1 = Avx.LoadVector256(ptr1);
                    var v2 = Avx.LoadVector256(ptr2);
                    v3    = Fma.MultiplyAdd(v1, v2, v3);
                    ptr1 += vectLen;
                    ptr2 += vectLen;
#if TEST
                    writer.WriteLine($"{v1.ToString()}\t{v2.ToString()}\t{v3.ToString()}");
#endif
                }

                for (i = 0; i < vectLen; i++)
                {
                    total += v3.GetElement(i);
                }

                i = vectCnt * vectLen;
                if (cnt % vectLen > 0)
                {
                    ptr1 = (float *)Unsafe.AsPointer(ref span1[i]);
                    ptr2 = (float *)Unsafe.AsPointer(ref span2[i]);
                    for (; i < cnt; i++)
                    {
                        total += *ptr1++ **ptr2++;
                    }
                }
            }

            if (vector1.Length != vector2.Length)
            {
                var h = vector1.Length > vector2.Length ? span1 : span2;
                for (var j = cnt; j < h.Length; j++)
                {
                    total += h[j];
                }
            }

            return(total);
        }
Пример #23
0
        private static void mixAvx2(ulong *sh, ulong *m)
        {
            // Rotate shuffle masks. We can safely convert the ref to a pointer because the compiler guarantees the
            // data is in a fixed location, and the ref itself is converted from a pointer. Same for the IV below.
            byte *prm = (byte *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(rormask));
            var   r24 = Avx2.BroadcastVector128ToVector256(prm);
            var   r16 = Avx2.BroadcastVector128ToVector256(prm + Vector128 <byte> .Count);

            var row1 = Avx.LoadVector256(sh);
            var row2 = Avx.LoadVector256(sh + Vector256 <ulong> .Count);

            ulong *piv  = (ulong *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(ivle));
            var    row3 = Avx.LoadVector256(piv);
            var    row4 = Avx.LoadVector256(piv + Vector256 <ulong> .Count);

            row4 = Avx2.Xor(row4, Avx.LoadVector256(sh + Vector256 <ulong> .Count * 2));           // t[] and f[]

            //ROUND 1
            var m0 = Avx2.BroadcastVector128ToVector256(m);
            var m1 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count);
            var m2 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 2);
            var m3 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 3);

            var t0 = Avx2.UnpackLow(m0, m1);
            var t1 = Avx2.UnpackLow(m2, m3);
            var b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            var m4 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 4);
            var m5 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 5);
            var m6 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 6);
            var m7 = Avx2.BroadcastVector128ToVector256(m + Vector128 <ulong> .Count * 7);

            t0 = Avx2.UnpackLow(m7, m4);
            t1 = Avx2.UnpackLow(m5, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.UnpackHigh(m5, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 2
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.UnpackHigh(m2, m0);
            t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m6, m1, 8);
            t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 3
            t0 = Avx2.AlignRight(m6, m5, 8);
            t1 = Avx2.UnpackHigh(m2, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m4, m0);
            t1 = Avx2.Blend(m1.AsUInt32(), m6.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.AlignRight(m5, m4, 8);
            t1 = Avx2.UnpackHigh(m1, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m2, m7);
            t1 = Avx2.Blend(m3.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 4
            t0 = Avx2.UnpackHigh(m3, m1);
            t1 = Avx2.UnpackHigh(m6, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m0);
            t1 = Avx2.UnpackLow(m6, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.AlignRight(m1, m7, 8);
            t1 = Avx2.Shuffle(m2.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m4, m3);
            t1 = Avx2.UnpackLow(m5, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 5
            t0 = Avx2.UnpackHigh(m4, m2);
            t1 = Avx2.UnpackLow(m1, m5);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.Blend(m0.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.Blend(m2.AsUInt32(), m7.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.AlignRight(m7, m1, 8);
            t1 = Avx2.AlignRight(m3, m5, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m6, m0);
            t1 = Avx2.UnpackLow(m6, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 6
            t0 = Avx2.UnpackLow(m1, m3);
            t1 = Avx2.UnpackLow(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m6, m5);
            t1 = Avx2.UnpackHigh(m5, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.AlignRight(m2, m0, 8);
            t1 = Avx2.UnpackHigh(m3, m7);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m4, m6);
            t1 = Avx2.AlignRight(m7, m2, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 7
            t0 = Avx2.Blend(m6.AsUInt32(), m0.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackLow(m7, m2);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m2, m7);
            t1 = Avx2.AlignRight(m5, m6, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.UnpackLow(m4, m0);
            t1 = Avx2.Blend(m3.AsUInt32(), m4.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m5, m3);
            t1 = Avx2.Shuffle(m1.AsUInt32(), 0b_01_00_11_10).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 8
            t0 = Avx2.UnpackHigh(m6, m3);
            t1 = Avx2.Blend(m6.AsUInt32(), m1.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m7, m5, 8);
            t1 = Avx2.UnpackHigh(m0, m4);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.Blend(m1.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.AlignRight(m4, m7, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m0);
            t1 = Avx2.UnpackLow(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 9
            t0 = Avx2.UnpackLow(m3, m7);
            t1 = Avx2.AlignRight(m0, m5, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.AlignRight(m4, m1, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.UnpackLow(m5, m6);
            t1 = Avx2.UnpackHigh(m6, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m1, m2, 8);
            t1 = Avx2.AlignRight(m2, m3, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 10
            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.UnpackHigh(m3, m0);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m1, m2);
            t1 = Avx2.Blend(m3.AsUInt32(), m2.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.UnpackHigh(m6, m7);
            t1 = Avx2.UnpackHigh(m4, m1);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
            t1 = Avx2.UnpackLow(m7, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 11
            t0 = Avx2.UnpackLow(m0, m1);
            t1 = Avx2.UnpackLow(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m0, m1);
            t1 = Avx2.UnpackHigh(m2, m3);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.UnpackLow(m7, m4);
            t1 = Avx2.UnpackLow(m5, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackHigh(m7, m4);
            t1 = Avx2.UnpackHigh(m5, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            //ROUND 12
            t0 = Avx2.UnpackLow(m7, m2);
            t1 = Avx2.UnpackHigh(m4, m6);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.UnpackLow(m5, m4);
            t1 = Avx2.AlignRight(m3, m7, 8);
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //DIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_10_01_00_11);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_00_11_10_01);

            t0 = Avx2.UnpackHigh(m2, m0);
            t1 = Avx2.Blend(m0.AsUInt32(), m5.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G1
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsUInt32(), 0b_10_11_00_01).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Shuffle(row2.AsByte(), r24).AsUInt64();

            t0 = Avx2.AlignRight(m6, m1, 8);
            t1 = Avx2.Blend(m1.AsUInt32(), m3.AsUInt32(), 0b_1100_1100).AsUInt64();
            b0 = Avx2.Blend(t0.AsUInt32(), t1.AsUInt32(), 0b_1111_0000).AsUInt64();

            //G2
            row1 = Avx2.Add(Avx2.Add(row1, b0), row2);
            row4 = Avx2.Xor(row4, row1);
            row4 = Avx2.Shuffle(row4.AsByte(), r16).AsUInt64();

            row3 = Avx2.Add(row3, row4);
            row2 = Avx2.Xor(row2, row3);
            row2 = Avx2.Xor(Avx2.ShiftRightLogical(row2, 63), Avx2.Add(row2, row2));

            //UNDIAGONALIZE
            row1 = Avx2.Permute4x64(row1, 0b_00_11_10_01);
            row4 = Avx2.Permute4x64(row4, 0b_01_00_11_10);
            row3 = Avx2.Permute4x64(row3, 0b_10_01_00_11);

            row1 = Avx2.Xor(row1, row3);
            row2 = Avx2.Xor(row2, row4);
            row1 = Avx2.Xor(row1, Avx.LoadVector256(sh));
            row2 = Avx2.Xor(row2, Avx.LoadVector256(sh + Vector256 <ulong> .Count));

            Avx.Store(sh, row1);
            Avx.Store(sh + Vector256 <ulong> .Count, row2);
        }
        /* Routine optimized for unshuffling a buffer for a type size larger than 16 bytes. */
        private static unsafe void unshuffle16_tiled_avx2(byte *dest, byte *src,
                                                          int vectorizable_elements, int total_elements, int bytesoftype)
        {
            int i;
            int j;
            var ymm0 = new Vector256 <byte> [16];
            var ymm1 = new Vector256 <byte> [16];

            var remainder = bytesoftype % sizeof(Vector128 <byte>);
            int vecs_rem  = remainder;

            /* The unshuffle loops are inverted (compared to shuffle_tiled16_avx2)
             * to optimize cache utilization. */
            int offset_into_type;

            for (offset_into_type = 0; offset_into_type < bytesoftype;
                 offset_into_type += (offset_into_type == 0 && vecs_rem > 0 ? vecs_rem : (int)sizeof(Vector128 <byte>)))
            {
                for (i = 0; i < vectorizable_elements; i += sizeof(Vector256 <byte>))
                {
                    /* Load the first 16 bytes of 32 adjacent elements (512 bytes) into 16 YMM registers */
                    byte *src_for_ith_element = src + i;
                    for (j = 0; j < 16; j++)
                    {
                        ymm0[j] = Avx.LoadVector256((src_for_ith_element + (total_elements * (offset_into_type + j))));
                    }

                    /* Shuffle bytes */
                    for (j = 0; j < 8; j++)
                    {
                        /* Compute the low 32 bytes */
                        ymm1[j] = Avx2.UnpackLow(ymm0[j * 2], ymm0[j * 2 + 1]);
                        /* Compute the hi 32 bytes */
                        ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2], ymm0[j * 2 + 1]);
                    }
                    /* Shuffle 2-byte words */
                    for (j = 0; j < 8; j++)
                    {
                        /* Compute the low 32 bytes */
                        ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte();
                        /* Compute the hi 32 bytes */
                        ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte();
                    }
                    /* Shuffle 4-byte dwords */
                    for (j = 0; j < 8; j++)
                    {
                        /* Compute the low 32 bytes */
                        ymm1[j] = Avx2.UnpackLow(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte();
                        /* Compute the hi 32 bytes */
                        ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte();
                    }

                    /* Shuffle 8-byte qwords */
                    for (j = 0; j < 8; j++)
                    {
                        /* Compute the low 32 bytes */
                        ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte();
                        /* Compute the hi 32 bytes */
                        ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte();
                    }

                    for (j = 0; j < 8; j++)
                    {
                        ymm1[j]     = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x20);
                        ymm1[j + 8] = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x31);
                    }

                    /* Store the result vectors in proper order */
                    byte *dest_with_offset = dest + offset_into_type;
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x01) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x00) * bytesoftype), ymm1[0]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x03) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x02) * bytesoftype), ymm1[4]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x05) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x04) * bytesoftype), ymm1[2]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x07) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x06) * bytesoftype), ymm1[6]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x09) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x08) * bytesoftype), ymm1[1]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x0b) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x0a) * bytesoftype), ymm1[5]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x0d) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x0c) * bytesoftype), ymm1[3]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x0f) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x0e) * bytesoftype), ymm1[7]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x11) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x10) * bytesoftype), ymm1[8]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x13) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x12) * bytesoftype), ymm1[12]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x15) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x14) * bytesoftype), ymm1[10]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x17) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x16) * bytesoftype), ymm1[14]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x19) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x18) * bytesoftype), ymm1[9]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x1b) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x1a) * bytesoftype), ymm1[13]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x1d) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x1c) * bytesoftype), ymm1[11]);
                    _mm256_storeu2_m128i(
                        (byte *)(dest_with_offset + (i + 0x1f) * bytesoftype),
                        (byte *)(dest_with_offset + (i + 0x1e) * bytesoftype), ymm1[15]);
                }
            }
        }
Пример #25
0
        public static unsafe bool TryGetAsciiString(byte *input, char *output, int count)
        {
            Debug.Assert(input != null);
            Debug.Assert(output != null);

            var end = input + count;

            Debug.Assert((long)end >= Vector256 <sbyte> .Count);

            // PERF: so the JIT can reuse the zero from a register
            Vector128 <sbyte> zero = Vector128 <sbyte> .Zero;

            if (Sse2.IsSupported)
            {
                if (Avx2.IsSupported && input <= end - Vector256 <sbyte> .Count)
                {
                    Vector256 <sbyte> avxZero = Vector256 <sbyte> .Zero;

                    do
                    {
                        var vector = Avx.LoadVector256(input).AsSByte();
                        if (!CheckBytesInAsciiRange(vector, avxZero))
                        {
                            return(false);
                        }

                        var tmp0 = Avx2.UnpackLow(vector, avxZero);
                        var tmp1 = Avx2.UnpackHigh(vector, avxZero);

                        // Bring into the right order
                        var out0 = Avx2.Permute2x128(tmp0, tmp1, 0x20);
                        var out1 = Avx2.Permute2x128(tmp0, tmp1, 0x31);

                        Avx.Store((ushort *)output, out0.AsUInt16());
                        Avx.Store((ushort *)output + Vector256 <ushort> .Count, out1.AsUInt16());

                        input  += Vector256 <sbyte> .Count;
                        output += Vector256 <sbyte> .Count;
                    } while (input <= end - Vector256 <sbyte> .Count);

                    if (input == end)
                    {
                        return(true);
                    }
                }

                if (input <= end - Vector128 <sbyte> .Count)
                {
                    do
                    {
                        var vector = Sse2.LoadVector128(input).AsSByte();
                        if (!CheckBytesInAsciiRange(vector, zero))
                        {
                            return(false);
                        }

                        var c0 = Sse2.UnpackLow(vector, zero).AsUInt16();
                        var c1 = Sse2.UnpackHigh(vector, zero).AsUInt16();

                        Sse2.Store((ushort *)output, c0);
                        Sse2.Store((ushort *)output + Vector128 <ushort> .Count, c1);

                        input  += Vector128 <sbyte> .Count;
                        output += Vector128 <sbyte> .Count;
                    } while (input <= end - Vector128 <sbyte> .Count);

                    if (input == end)
                    {
                        return(true);
                    }
                }
            }
            else if (Vector.IsHardwareAccelerated)
            {
                while (input <= end - Vector <sbyte> .Count)
                {
                    var vector = Unsafe.AsRef <Vector <sbyte> >(input);
                    if (!CheckBytesInAsciiRange(vector))
                    {
                        return(false);
                    }

                    Vector.Widen(
                        vector,
                        out Unsafe.AsRef <Vector <short> >(output),
                        out Unsafe.AsRef <Vector <short> >(output + Vector <short> .Count));

                    input  += Vector <sbyte> .Count;
                    output += Vector <sbyte> .Count;
                }

                if (input == end)
                {
                    return(true);
                }
            }

            if (Environment.Is64BitProcess) // Use Intrinsic switch for branch elimination
            {
                // 64-bit: Loop longs by default
                while (input <= end - sizeof(long))
                {
                    var value = *(long *)input;
                    if (!CheckBytesInAsciiRange(value))
                    {
                        return(false);
                    }

                    // BMI2 could be used, but this variant is faster on both Intel and AMD.
                    if (Sse2.X64.IsSupported)
                    {
                        Vector128 <sbyte> vecNarrow = Sse2.X64.ConvertScalarToVector128Int64(value).AsSByte();
                        Vector128 <ulong> vecWide   = Sse2.UnpackLow(vecNarrow, zero).AsUInt64();
                        Sse2.Store((ulong *)output, vecWide);
                    }
                    else
                    {
                        output[0] = (char)input[0];
                        output[1] = (char)input[1];
                        output[2] = (char)input[2];
                        output[3] = (char)input[3];
                        output[4] = (char)input[4];
                        output[5] = (char)input[5];
                        output[6] = (char)input[6];
                        output[7] = (char)input[7];
                    }

                    input  += sizeof(long);
                    output += sizeof(long);
                }

                if (input <= end - sizeof(int))
                {
                    var value = *(int *)input;
                    if (!CheckBytesInAsciiRange(value))
                    {
                        return(false);
                    }

                    WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);

                    input  += sizeof(int);
                    output += sizeof(int);
                }
            }
            else
            {
                // 32-bit: Loop ints by default
                while (input <= end - sizeof(int))
                {
                    var value = *(int *)input;
                    if (!CheckBytesInAsciiRange(value))
                    {
                        return(false);
                    }

                    WidenFourAsciiBytesToUtf16AndWriteToBuffer(output, input, value, zero);

                    input  += sizeof(int);
                    output += sizeof(int);
                }
            }

            if (input <= end - sizeof(short))
            {
                if (!CheckBytesInAsciiRange(((short *)input)[0]))
                {
                    return(false);
                }

                output[0] = (char)input[0];
                output[1] = (char)input[1];

                input  += sizeof(short);
                output += sizeof(short);
            }

            if (input < end)
            {
                if (!CheckBytesInAsciiRange(((sbyte *)input)[0]))
                {
                    return(false);
                }
                output[0] = (char)input[0];
            }

            return(true);
        }
        unsafe public int[,] IntegrateUnsafeVector()
        {
            if (!Avx2.IsSupported)
            {
                throw new InvalidOperationException("Avx2 is not supported - cannot perform vector operation");
            }
            int w = _data.Width();
            int h = _data.Height();

            int[,] res = new int[h, w];

            fixed(byte *pSource = &_data[0, 0])
            fixed(int *pTarget = &res[0, 0])
            {
                var pSrc = pSource;
                var pTrg = pTarget;

                {// handle the first line
                    var j = 0;
                    var c = 0;
                    //handle vector part
                    for (; j + Vector256 <int> .Count <= w; j += Vector256 <int> .Count)
                    {
                        var t = Avx2.ConvertToVector256Int32(pSrc);

                        t = Aggregate(t, c);

                        Avx.Store(pTrg, t);
                        c = t.GetElement(Vector256 <int> .Count - 1);

                        pSrc += Vector256 <int> .Count;
                        pTrg += Vector256 <int> .Count;
                    }

                    // handle the tail
                    for (; j < w; j++)
                    {
                        c += *pSrc++;
                        *pTrg++ = c;
                    }
                }
                //handle the other lines

                for (var i = 1; i < h; i++)
                {
                    var j = 0;
                    var c = 0;
                    //handle vector part
                    for (; j + Vector256 <int> .Count <= w; j += Vector256 <int> .Count)
                    {
                        var t = Avx2.ConvertToVector256Int32(pSrc);

                        t = Aggregate(t, c);

                        c = t.GetElement(Vector256 <int> .Count - 1);

                        var p = Avx.LoadVector256(pTrg - w); // prev line vector

                        t = Avx2.Add(t, p);

                        Avx.Store(pTrg, t);
                        pSrc += Vector256 <int> .Count;
                        pTrg += Vector256 <int> .Count;
                    }

                    // handle the tail
                    for (; j < w; j++)
                    {
                        c += *pSrc++;

                        var q = *(pTrg - w);

                        *pTrg++ = c + q;
                    }
                }
            }
            return(res);
        }
Пример #27
0
        private void Word_32Bytes_WithPrefetch_Internal <TPrefetchConfiguration>(void *originalBuffer, void *modifiedBuffer, int size)
            where TPrefetchConfiguration : struct, IPrefetchConfiguration
        {
            Debug.Assert(size % 4096 == 0);
            Debug.Assert(size % sizeof(long) == 0);

            TPrefetchConfiguration config = default;

            byte *writePtr       = destination;
            long  writePtrOffset = 16;

            // This stops the JIT from accesing originalBuffer directly, as we know
            // it is not mutable, this lowers the number of generated instructions
            byte *ptr    = (byte *)originalBuffer;
            long  offset = (byte *)modifiedBuffer - (byte *)originalBuffer;

            bool started = false;

            for (byte *end = ptr + size; ptr < end; ptr += 32)
            {
                Vector256 <byte> o = Avx.LoadVector256(ptr);
                Vector256 <byte> m = Avx.LoadVector256(ptr + offset);

                o = Avx2.Xor(o, m);
                if (!Avx.TestZ(o, o))
                {
                    if (started == false)
                    {
                        // Write the start index of the run based from the start of the page we are diffing.
                        *(long *)(writePtr + 0) = ptr - (byte *)originalBuffer;
                        started = true;
                    }

                    Avx.Store((writePtr + writePtrOffset), m);
                    writePtrOffset += 32;
                }
                else if (started) // our byte is untouched here.
                {
                    // We write the actual size of the stored data.
                    *(long *)(writePtr + 8) = writePtrOffset - 16;

                    // We advance the write pointer to the start of the next.
                    writePtr += writePtrOffset;

                    // We reset the write pointer but not before actually substracting the written amount
                    // from the available space.
                    writePtrOffset = 16;

                    started = false;
                }

                config.Prefetch(ptr + 2048);
                config.Prefetch(ptr + offset + 2048);
            }

            // If the block hasnt been touched, nothing to do here unless we have an open pointer.
            if (started)
            {
                // We write the actual size of the stored data.
                *(long *)(writePtr + 8) = writePtrOffset - 16;
            }
        }
        /* Routine optimized for shuffling a buffer for a type size of 16 bytes. */
        private static unsafe void shuffle16_avx2(byte *dest, byte *src,
                                                  int vectorizable_elements, int total_elements)
        {
            int bytesoftype = 16;
            int j;
            int k, l;
            var ymm0 = new Vector256 <byte> [16];
            var ymm1 = new Vector256 <byte> [16];

            /* Create the shuffle mask.
             * NOTE: The XMM/YMM 'set' intrinsics require the arguments to be ordered from
             * most to least significant (i.e., their order is reversed when compared to
             * loading the mask from an array). */
            var shmask = Vector256.Create((byte)
                                          0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
                                          0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00,
                                          0x0f, 0x07, 0x0e, 0x06, 0x0d, 0x05, 0x0c, 0x04,
                                          0x0b, 0x03, 0x0a, 0x02, 0x09, 0x01, 0x08, 0x00);

            for (j = 0; j < vectorizable_elements; j += sizeof(Vector256 <byte>))
            {
                /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
                for (k = 0; k < 16; k++)
                {
                    ymm0[k] = Avx.LoadVector256((src + (j * bytesoftype) + (k * sizeof(Vector256 <byte>))));
                }
                /* Transpose bytes */
                for (k = 0, l = 0; k < 8; k++, l += 2)
                {
                    ymm1[k * 2]     = Avx2.UnpackLow(ymm0[l], ymm0[l + 1]);
                    ymm1[k * 2 + 1] = Avx2.UnpackHigh(ymm0[l], ymm0[l + 1]);
                }
                /* Transpose words */
                for (k = 0, l = -2; k < 8; k++, l++)
                {
                    if ((k % 2) == 0)
                    {
                        l += 2;
                    }
                    ymm0[k * 2]     = Avx2.UnpackLow(ymm1[l].AsInt16(), ymm1[l + 2].AsInt16()).AsByte();
                    ymm0[k * 2 + 1] = Avx2.UnpackHigh(ymm1[l].AsInt16(), ymm1[l + 2].AsInt16()).AsByte();
                }
                /* Transpose double words */
                for (k = 0, l = -4; k < 8; k++, l++)
                {
                    if ((k % 4) == 0)
                    {
                        l += 4;
                    }
                    ymm1[k * 2]     = Avx2.UnpackLow(ymm0[l].AsInt32(), ymm0[l + 4].AsInt32()).AsByte();
                    ymm1[k * 2 + 1] = Avx2.UnpackHigh(ymm0[l].AsInt32(), ymm0[l + 4].AsInt32()).AsByte();
                }
                /* Transpose quad words */
                for (k = 0; k < 8; k++)
                {
                    ymm0[k * 2]     = Avx2.UnpackLow(ymm1[k].AsInt64(), ymm1[k + 8].AsInt64()).AsByte();
                    ymm0[k * 2 + 1] = Avx2.UnpackHigh(ymm1[k].AsInt64(), ymm1[k + 8].AsInt64()).AsByte();
                }
                for (k = 0; k < 16; k++)
                {
                    ymm0[k] = Avx2.Permute4x64(ymm0[k].AsInt64(), 0xd8).AsByte();
                    ymm0[k] = Avx2.Shuffle(ymm0[k], shmask);
                }
                /* Store the result vectors */
                byte *dest_for_jth_element = dest + j;
                for (k = 0; k < 16; k++)
                {
                    Avx2.Store((dest_for_jth_element + (k * total_elements)), ymm0[k]);
                }
            }
        }
Пример #29
0
        public void RunLclVarScenario_Load()
        {
            var firstOp = Avx.LoadVector256((Byte *)(_dataTable.inArrayPtr));

            Avx2.ExtractVector128((Byte *)_dataTable.outArrayPtr, firstOp, 1);
        }
        /* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
        private static unsafe void unshuffle16_avx2(byte *dest, byte *src,
                                                    int vectorizable_elements, int total_elements)
        {
            int bytesoftype = 16;
            int i;
            int j;
            var ymm0 = new Vector256 <byte> [16];
            var ymm1 = new Vector256 <byte> [16];

            for (i = 0; i < vectorizable_elements; i += sizeof(Vector256 <byte>))
            {
                /* Fetch 32 elements (512 bytes) into 16 YMM registers. */
                byte *src_for_ith_element = src + i;
                for (j = 0; j < 16; j++)
                {
                    ymm0[j] = Avx.LoadVector256((src_for_ith_element + (j * total_elements)));
                }

                /* Shuffle bytes */
                for (j = 0; j < 8; j++)
                {
                    /* Compute the low 32 bytes */
                    ymm1[j] = Avx2.UnpackLow(ymm0[j * 2], ymm0[j * 2 + 1]);
                    /* Compute the hi 32 bytes */
                    ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2], ymm0[j * 2 + 1]);
                }
                /* Shuffle 2-byte words */
                for (j = 0; j < 8; j++)
                {
                    /* Compute the low 32 bytes */
                    ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte();
                    /* Compute the hi 32 bytes */
                    ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt16(), ymm1[j * 2 + 1].AsInt16()).AsByte();
                }
                /* Shuffle 4-byte dwords */
                for (j = 0; j < 8; j++)
                {
                    /* Compute the low 32 bytes */
                    ymm1[j] = Avx2.UnpackLow(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte();
                    /* Compute the hi 32 bytes */
                    ymm1[8 + j] = Avx2.UnpackHigh(ymm0[j * 2].AsInt32(), ymm0[j * 2 + 1].AsInt32()).AsByte();
                }

                /* Shuffle 8-byte qwords */
                for (j = 0; j < 8; j++)
                {
                    /* Compute the low 32 bytes */
                    ymm0[j] = Avx2.UnpackLow(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte();
                    /* Compute the hi 32 bytes */
                    ymm0[8 + j] = Avx2.UnpackHigh(ymm1[j * 2].AsInt64(), ymm1[j * 2 + 1].AsInt64()).AsByte();
                }

                for (j = 0; j < 8; j++)
                {
                    ymm1[j]     = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x20);
                    ymm1[j + 8] = Avx2.Permute2x128(ymm0[j], ymm0[j + 8], 0x31);
                }

                /* Store the result vectors in proper order */
                Avx2.Store((dest + (i * bytesoftype) + (0 * sizeof(Vector256 <byte>))), ymm1[0]);
                Avx2.Store((dest + (i * bytesoftype) + (1 * sizeof(Vector256 <byte>))), ymm1[4]);
                Avx2.Store((dest + (i * bytesoftype) + (2 * sizeof(Vector256 <byte>))), ymm1[2]);
                Avx2.Store((dest + (i * bytesoftype) + (3 * sizeof(Vector256 <byte>))), ymm1[6]);
                Avx2.Store((dest + (i * bytesoftype) + (4 * sizeof(Vector256 <byte>))), ymm1[1]);
                Avx2.Store((dest + (i * bytesoftype) + (5 * sizeof(Vector256 <byte>))), ymm1[5]);
                Avx2.Store((dest + (i * bytesoftype) + (6 * sizeof(Vector256 <byte>))), ymm1[3]);
                Avx2.Store((dest + (i * bytesoftype) + (7 * sizeof(Vector256 <byte>))), ymm1[7]);
                Avx2.Store((dest + (i * bytesoftype) + (8 * sizeof(Vector256 <byte>))), ymm1[8]);
                Avx2.Store((dest + (i * bytesoftype) + (9 * sizeof(Vector256 <byte>))), ymm1[12]);
                Avx2.Store((dest + (i * bytesoftype) + (10 * sizeof(Vector256 <byte>))), ymm1[10]);
                Avx2.Store((dest + (i * bytesoftype) + (11 * sizeof(Vector256 <byte>))), ymm1[14]);
                Avx2.Store((dest + (i * bytesoftype) + (12 * sizeof(Vector256 <byte>))), ymm1[9]);
                Avx2.Store((dest + (i * bytesoftype) + (13 * sizeof(Vector256 <byte>))), ymm1[13]);
                Avx2.Store((dest + (i * bytesoftype) + (14 * sizeof(Vector256 <byte>))), ymm1[11]);
                Avx2.Store((dest + (i * bytesoftype) + (15 * sizeof(Vector256 <byte>))), ymm1[15]);
            }
        }