Beispiel #1
0
        private static bool TestSse41X64Extract_UInt64()
        {
            // Round trip: put ulong.MaxValue into element 0 of a vector, then
            // read element 0 back with the 64-bit SSE4.1 extract.
            const ulong expected = ulong.MaxValue;
            Vector128<ulong> source = Vector128.CreateScalar(expected);
            ulong extracted = Sse41.X64.Extract(source, 0);

            return AreEqual(expected, extracted);
        }
Beispiel #2
0
        private static bool TestSseX64ConvertToInt64WithTruncation()
        {
            // Feeds (float)long.MaxValue through the truncating float -> Int64
            // conversion. This test pins the result for that input to long.MinValue.
            Vector128<float> source = Vector128.CreateScalar((float)long.MaxValue);
            long converted = Sse.X64.ConvertToInt64WithTruncation(source);

            return AreEqual(long.MinValue, converted);
        }
Beispiel #3
0
        private static bool TestSse2X64ConvertToInt64_Vector128Double()
        {
            // Feeds (double)long.MaxValue through the double -> Int64 conversion.
            // This test pins the result for that input to long.MinValue.
            Vector128<double> source = Vector128.CreateScalar((double)long.MaxValue);
            long converted = Sse2.X64.ConvertToInt64(source);

            return AreEqual(long.MinValue, converted);
        }
Beispiel #4
0
        private static bool TestSse2X64ConvertToUInt64()
        {
            // Round trip: element 0 holds ulong.MaxValue and must come back
            // unchanged from the vector -> UInt64 conversion.
            const ulong expected = ulong.MaxValue;
            Vector128<ulong> source = Vector128.CreateScalar(expected);
            ulong converted = Sse2.X64.ConvertToUInt64(source);

            return AreEqual(expected, converted);
        }
Beispiel #5
0
        private static ushort ToUshort(int a)
        {
            // Narrow the 32-bit value to 16 bits with the SSE4.1 unsigned
            // saturating pack, then read element 0 of the packed result.
            Vector128<int> wide = Vector128.CreateScalar(a);
            Vector128<ushort> packed = Sse41.PackUnsignedSaturate(wide, wide);

            return packed.ToScalar();
        }
Beispiel #6
0
        private static unsafe void CalculateRow(int *previousRowPtr, char *targetPtr, int targetLength, char sourcePrevChar, int lastInsertionCost, int lastSubstitutionCost)
        {
            // Updates one row of the cost table in place. For every column the old
            // cell value is the deletion cost. When the characters match, the new
            // cell is the substitution cost; otherwise it is
            // 1 + min(substitution, insertion, deletion).
            for (var column = 0; column < targetLength; column++)
            {
                // Read the old cell before it is overwritten below.
                int deletionCost = previousRowPtr[column];
                int cost         = lastSubstitutionCost;

                if (sourcePrevChar != targetPtr[column])
                {
                    Vector128<int> insertOrDelete = Sse41.Min(
                        Vector128.CreateScalar(lastInsertionCost),
                        Vector128.CreateScalar(deletionCost));
                    Vector128<int> cheapest = Sse41.Min(Vector128.CreateScalar(cost), insertOrDelete);

                    cost = cheapest.GetElement(0);
                    cost++;
                }

                previousRowPtr[column] = cost;
                lastInsertionCost      = cost;
                lastSubstitutionCost   = deletionCost;
            }
        }
Beispiel #7
0
        private static unsafe void CalculateRow(int *previousRowPtr, char *targetPtr, int targetLength, char sourcePrevChar, int lastInsertionCost, int lastSubstitutionCost)
        {
            // In-place row update that keeps the running costs in vector form:
            // only element 0 of each vector carries data, and a value is moved
            // back to a scalar only when it is stored into the row.
            Vector128<int> increment    = Vector128.CreateScalar(1);
            Vector128<int> substitution = Vector128.CreateScalar(lastSubstitutionCost);
            Vector128<int> insertion    = Vector128.CreateScalar(lastInsertionCost);

            for (var column = 0; column < targetLength; column++)
            {
                // Old cell value, read before it is overwritten below.
                Vector128<int> deletion = Vector128.CreateScalar(previousRowPtr[column]);
                Vector128<int> cost     = substitution;

                if (sourcePrevChar != targetPtr[column])
                {
                    // Characters differ: 1 + min(substitution, insertion, deletion).
                    Vector128<int> cheapest = Sse41.Min(cost, Sse41.Min(insertion, deletion));
                    cost = Sse2.Add(increment, cheapest);
                }

                previousRowPtr[column] = cost.GetElement(0);
                insertion    = cost;
                substitution = deletion;
            }
        }
Beispiel #8
0
 private static int Test()
 {
     // Computes 0 - 42 with the SSE2 integer subtract and returns element 0.
     // Both operands are constant vectors (the original author notes that
     // LLVM is able to fold constant vectors).
     Vector128<int> minuend    = Vector128<int>.Zero;
     Vector128<int> subtrahend = Vector128.CreateScalar(42);

     return Sse2.Subtract(minuend, subtrahend).ToScalar();
 }
        private static unsafe void CalculateRow(int *previousRowPtr, char *targetPtr, int targetLength, char sourcePrevChar, int lastInsertionCost, int lastSubstitutionCost)
        {
            // Walks the row and the target characters in lock step, rewriting each
            // row cell in place.
            int  *cell       = previousRowPtr;
            char *targetChar = targetPtr;

            for (int remaining = targetLength; remaining > 0; remaining--, cell++, targetChar++)
            {
                // The old cell value is the deletion cost; read it before overwriting.
                int deletionCost = *cell;
                int cost         = lastSubstitutionCost;

                if (*targetChar != sourcePrevChar)
                {
                    // Scalar equivalent: Math.Min(Math.Min(insertion, deletion), substitution) + 1.
                    var insertOrDelete = Sse41.Min(
                        Vector128.CreateScalar(lastInsertionCost),
                        Vector128.CreateScalar(deletionCost));

                    cost = Sse41.Min(Vector128.CreateScalar(cost), insertOrDelete).GetElement(0);
                    cost++;
                }

                *cell                = cost;
                lastInsertionCost    = cost;
                lastSubstitutionCost = deletionCost;
            }
        }
Beispiel #10
0
 static void TestExplicitFmaUsage6(ref Vector128 <float> a, float b)
 {
     // Compares the reference b * b + b against Fma.MultiplyAdd where each
     // operand is built with a different factory (CreateScalarUnsafe,
     // CreateScalar, Create). Only element 0 of the FMA result is inspected.
     // The parameter a is not used here.
     var expected = ReferenceMultiplyAdd(b, b, b);

     Vector128<float> multiplicand = Vector128.CreateScalarUnsafe(b);
     Vector128<float> multiplier   = Vector128.CreateScalar(b);
     Vector128<float> addend       = Vector128.Create(b);

     CompareFloats(expected, Fma.MultiplyAdd(multiplicand, multiplier, addend).ToScalar());
 }
Beispiel #11
0
 static void TestExplicitFmaUsage6(ref Vector128 <double> a, double b)
 {
     // Compares the reference b * b + b against Fma.MultiplyAdd where each
     // operand is built with a different factory (CreateScalarUnsafe,
     // CreateScalar, Create). Only element 0 of the FMA result is inspected.
     // The parameter a is not used here.
     var expected = ReferenceMultiplyAdd(b, b, b);

     Vector128<double> multiplicand = Vector128.CreateScalarUnsafe(b);
     Vector128<double> multiplier   = Vector128.CreateScalar(b);
     Vector128<double> addend       = Vector128.Create(b);

     CompareDoubles(expected, Fma.MultiplyAdd(multiplicand, multiplier, addend).ToScalar());
 }
        private unsafe bool TryParseInt(long input, out int value)
        {
            // Steps, in order:
            //   1. subtract ShortCharA from the input;
            //   2. the input counts as valid when no bit selected by ShortN15 is set;
            //   3. OR together the bits shifted left by 4 and right by 8;
            //   4. byte-shuffle with NShuffleMask and take the low 32 bits.
            // NOTE(review): presumably input packs several UTF-16 digits and the
            // masks select their values - confirm against the field definitions.
            var rebased  = input - ShortCharA;
            bool isValid = (rebased & ShortN15) == 0;

            ulong bits = (ulong)rebased;
            rebased = (long)((bits << 4) | (bits >> 8));

            var shuffled = Ssse3.Shuffle(Vector128.CreateScalar(rebased).AsSByte(), NShuffleMask);
            value = Sse41.Extract(shuffled.AsInt32(), 0);

            // value is always assigned; the return value tells the caller whether to trust it.
            return isValid;
        }
        public void RunBasicScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario));

            // Take one generated SByte, wrap it with Vector128.CreateScalar and
            // hand both the vector and the original value to the validator.
            SByte expected = TestLibrary.Generator.GetSByte();
            Vector128<SByte> actual = Vector128.CreateScalar(expected);

            ValidateResult(actual, expected);
        }
        private static unsafe void FillRowSSe(ushort *previousRow, int length)
        {
            // Writes 1, 2, 3, ... into the first `length` cells. The counter lives
            // in element 0 of a vector and advances with a saturating add, so it
            // sticks at ushort.MaxValue instead of wrapping.
            Vector128<ushort> step    = Vector128.CreateScalar((ushort)1);
            Vector128<ushort> counter = step;

            int index = 0;
            while (index < length)
            {
                previousRow[index++] = counter.ToScalar();
                counter = Sse2.AddSaturate(counter, step);
            }
        }
Beispiel #15
0
    /// <summary>Creates a matrix from the specified rotation around the z-axis.</summary>
    /// <param name="rotationZ">A float representing the rotation around the z-axis for the matrix.</param>
    /// <returns>A matrix that represents <paramref name="rotationZ" />.</returns>
    public static Matrix4x4 CreateFromRotationZ(float rotationZ)
    {
        var (sin, cos) = SinCos(rotationZ);

        // First row: cos and sin interleaved from two scalar vectors.
        var row0 = InterleaveLower(Vector128.CreateScalar(cos), Vector128.CreateScalar(sin));

        // Second row: the first row permuted by CreateFromYXZW (presumably a
        // swap of x and y), with the resulting first component negated.
        var negateFirst = Vector128.Create(-1.0f, 1.0f, 1.0f, 1.0f);
        var row1 = Multiply(CreateFromYXZW(row0), negateFirst);

        return Create(row0, row1, UnitZ, UnitW);
    }
Beispiel #16
0
    /// <summary>Creates a matrix from the specified rotation around the y-axis.</summary>
    /// <param name="rotationY">A float representing the rotation around the y-axis for the matrix.</param>
    /// <returns>A matrix that represents <paramref name="rotationY" />.</returns>
    public static Matrix4x4 CreateFromRotationY(float rotationY)
    {
        var (sin, cos) = SinCos(rotationY);

        // Third row: sin and cos combined by CreateFromXWAD from two scalar vectors.
        var row2 = CreateFromXWAD(Vector128.CreateScalar(sin), Vector128.CreateScalar(cos));

        // First row: the third row permuted by CreateFromZYXW (presumably a
        // swap of x and z), with the resulting third component negated.
        var negateThird = Vector128.Create(1.0f, 1.0f, -1.0f, 1.0f);
        var row0 = Multiply(CreateFromZYXW(row2), negateThird);

        return Create(row0, UnitY, row2, UnitW);
    }
Beispiel #17
0
    /// <summary>Creates a matrix from the specified rotation around the x-axis.</summary>
    /// <param name="rotationX">A float representing the rotation around the x-axis for the matrix.</param>
    /// <returns>A matrix that represents <paramref name="rotationX" />.</returns>
    public static Matrix4x4 CreateFromRotationX(float rotationX)
    {
        var (sin, cos) = SinCos(rotationX);

        // Second row: cos and sin combined by CreateFromWXAD from two scalar vectors.
        var row1 = CreateFromWXAD(Vector128.CreateScalar(cos), Vector128.CreateScalar(sin));

        // Third row (built inline below): the second row permuted by
        // CreateFromXZYW (presumably a swap of y and z), with the resulting
        // second component negated.
        var negateSecond = Vector128.Create(1.0f, -1.0f, 1.0f, 1.0f);

        return Create(UnitX, row1, Multiply(CreateFromXZYW(row1), negateSecond), UnitW);
    }
Beispiel #18
0
        public void SplitFraction()
        {
            // Randomised check of Utils.SplitFraction: for each sample the vector
            // result must match the integral and fractional parts computed with
            // plain scalar arithmetic.
            var random = new Random();

            var iteration = 0;
            while (iteration < Iterations)
            {
                // Draw order matters for reproducing a given random sequence:
                // the fraction first, then the whole part.
                double fraction = random.NextDouble();
                int    whole    = random.Next(0, Int32.MaxValue - 1);
                double sample   = fraction + whole;

                Utils.SplitFraction(Vector128.CreateScalar(sample), out var integralPart, out var fractionalPart);

                double expectedIntegral = (double)(int)sample;
                double expectedFraction = sample - expectedIntegral;

                Assert.Equal(expectedIntegral, integralPart.ToScalar());
                Assert.Equal(expectedFraction, fractionalPart.ToScalar());

                iteration++;
            }
        }
Beispiel #19
0
 /// <summary>Returns the result of <c>Vector128.CreateScalar</c> applied to <paramref name="v" />.</summary>
 public static __m128 _mm_set_ss(float v)
 {
     return Vector128.CreateScalar(v);
 }
Beispiel #20
0
 private static ushort Min(ushort a, ushort b)
 {
     // Unsigned 16-bit minimum via the SSE4.1 vector min; only element 0 is used.
     Vector128<ushort> left  = Vector128.CreateScalar(a);
     Vector128<ushort> right = Vector128.CreateScalar(b);

     return Sse41.Min(left, right).ToScalar();
 }
        /// <summary>
        /// Computes an edit distance between a slice of <paramref name="sourceString" /> and a slice of
        /// <paramref name="targetString" /> using a single rolling row of 16-bit costs.
        /// </summary>
        /// <param name="sourceString">String providing the source characters.</param>
        /// <param name="sourceLength">Number of source characters to compare.</param>
        /// <param name="targetString">String providing the target characters.</param>
        /// <param name="targetLength">Number of target characters to compare; also the length of the cost row.</param>
        /// <param name="startIndex">Offset at which both slices start.</param>
        /// <returns>The last element of the cost row after all source characters have been processed.</returns>
        /// <remarks>
        /// Source rows are processed eight at a time (one per 16-bit lane) while at least eight rows remain and the
        /// target has at least eight columns; the remaining rows go through <c>CalculateRow</c>.
        /// </remarks>
        private static unsafe int CalculateDistance(string sourceString, int sourceLength, string targetString, int targetLength, int startIndex)
        {
            var arrayPool   = ArrayPool<ushort>.Shared;
            var pooledArray = arrayPool.Rent(targetLength);

            try
            {
                // ArrayPool may hand back a larger array than requested; trim the span to exactly what we use.
                Span<ushort>       previousRow = pooledArray.AsSpan(0, targetLength);
                ReadOnlySpan<char> source      = sourceString.AsSpan().Slice(startIndex, sourceLength);
                ReadOnlySpan<char> target      = targetString.AsSpan().Slice(startIndex, targetLength);

                fixed (char* targetPtr = target)
                fixed (char* srcPtr = source)
                fixed (ushort* previousRowPtr = previousRow)
                {
                    FillRow(previousRowPtr, targetLength);

                    var rowIndex = 0;

                    // The vector path unconditionally touches columns 0..7 and stores to indices starting at
                    // targetLength - 7, so it is only entered when the target has at least 8 columns.
                    for (; targetLength >= 8 && rowIndex < sourceLength - 7; rowIndex += 8)
                    {
                        // todo max
                        var temp = Vector128.Create(rowIndex);
                        var diag = Sse42.PackUnsignedSaturate(temp, temp);
                        var one  = Vector128.Create((ushort)1);
                        var left = Sse42.AddSaturate(diag, one);

                        var sourceV = Sse42.LoadVector128((ushort*)(srcPtr + rowIndex));
                        var targetV = Vector128<ushort>.Zero;

                        var shift = Vector128.CreateScalar(ushort.MaxValue);

                        // Prime the vector: lane k first sees column 0 on iteration k, so all 8 lanes need
                        // 8 iterations before lane 7 holds the first finished value.
                        for (int columnIndex = 0; columnIndex < 8; columnIndex++)
                        {
                            // Shift in the next character
                            targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 2);
                            targetV = Sse42.Insert(targetV, (ushort)targetPtr[columnIndex], 0);

                            // Insert "(rowIndex + columnIndex + 1)" from the left
                            var leftValue = Vector128.Create(rowIndex + columnIndex + 1);
                            left  = Sse42.Or(Sse42.And(shift, Sse42.PackUnsignedSaturate(leftValue, leftValue)), left);
                            shift = Sse42.ShiftLeftLogical128BitLane(shift, 2);

                            // Compare source to target: add 1 to the diagonal only where the characters differ
                            var match = Sse42.CompareEqual(sourceV, targetV);
                            var add   = Sse42.AndNot(match, one);
                            var next  = Sse42.AddSaturate(diag, add);

                            // Create next diag which is current up
                            var up = Sse42.ShiftLeftLogical128BitLane(left, 2);
                            up = Sse42.Insert(up, (ushort)previousRowPtr[columnIndex], 0);

                            var tmp = Sse42.AddSaturate(Sse42.Min(left, up), one);
                            next = Sse42.Min(next, tmp);

                            left = next;
                            diag = up;
                        }

                        // Lane 7 now holds the finished value for column 0.
                        previousRowPtr[0] = Sse42.Extract(left, 7);

                        // Steady state: each new column completes the value of the column 7 positions back.
                        for (int columnIndex = 8; columnIndex < targetLength; columnIndex++)
                        {
                            // Shift in the next character
                            targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 2);
                            targetV = Sse42.Insert(targetV, (ushort)targetPtr[columnIndex], 0);

                            // Compare source to target: add 1 to the diagonal only where the characters differ
                            var match = Sse42.CompareEqual(sourceV, targetV);
                            var add   = Sse42.AndNot(match, one);
                            var next  = Sse42.AddSaturate(diag, add);

                            // Create next diag which is current up
                            var up = Sse42.ShiftLeftLogical128BitLane(left, 2);
                            up = Sse42.Insert(up, (ushort)previousRowPtr[columnIndex], 0);

                            var tmp = Sse42.AddSaturate(Sse42.Min(left, up), one);
                            next = Sse42.Min(next, tmp);

                            left = next;
                            diag = up;

                            // Store one value
                            previousRowPtr[columnIndex - 7] = Sse42.Extract(next, 7);
                        }

                        // Drain the last 7 columns: no more characters are read, values are only shifted out.
                        for (int i = targetLength - 7; i < previousRow.Length; i++)
                        {
                            // Keep shifting the target window
                            targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 2);

                            // Compare source to target: add 1 to the diagonal only where the characters differ
                            var match = Sse42.CompareEqual(sourceV, targetV);
                            var add   = Sse42.AndNot(match, one);
                            var next  = Sse42.AddSaturate(diag, add);

                            // Create next diag which is current up
                            var up = Sse42.ShiftLeftLogical128BitLane(left, 2);

                            var tmp = Sse42.AddSaturate(Sse42.Min(left, up), one);
                            next = Sse42.Min(next, tmp);

                            left = next;
                            diag = up;

                            // Store one value
                            previousRowPtr[i] = Sse42.Extract(next, 7);
                        }

#if DEBUG
                        Console.Write("prev values for row {0}:", rowIndex);
                        for (int i = 0; i < targetLength; ++i)
                        {
                            Console.Write("{0} ", previousRow[i]);
                        }
                        Console.WriteLine();
#endif
                    }

                    // Calculate the remaining rows one at a time
                    for (; rowIndex < sourceLength; rowIndex++)
                    {
                        var lastSubstitutionCost = rowIndex;
                        var lastInsertionCost    = rowIndex + 1;
                        var sourcePrevChar       = source[rowIndex];
#if DEBUG
                        Console.Write("prev values for row {0}:", rowIndex);
                        for (int i = 0; i < targetLength; ++i)
                        {
                            Console.Write("{0} ", previousRow[i]);
                        }
                        Console.WriteLine();
#endif

                        CalculateRow(previousRowPtr, targetPtr, targetLength, sourcePrevChar, lastInsertionCost, lastSubstitutionCost);
                    }
                }

                return previousRow[targetLength - 1];
            }
            finally
            {
                // Hand the buffer back even when slicing or the row computation throws.
                arrayPool.Return(pooledArray);
            }
        }
Beispiel #22
0
        /// <summary>
        /// Computes an edit distance between a slice of <paramref name="sourceString" /> and a slice of
        /// <paramref name="targetString" /> using a single rolling row of 32-bit costs.
        /// </summary>
        /// <param name="sourceString">String providing the source characters.</param>
        /// <param name="sourceLength">Number of source characters to compare.</param>
        /// <param name="targetString">String providing the target characters.</param>
        /// <param name="targetLength">Number of target characters to compare; also the length of the cost row.</param>
        /// <param name="startIndex">Offset at which both slices start.</param>
        /// <returns>The last element of the cost row after all source characters have been processed.</returns>
        /// <remarks>
        /// Source rows are processed four at a time (one per 32-bit lane) while at least four rows remain and the
        /// target has at least four columns; the remaining rows go through <c>CalculateRow</c>.
        /// </remarks>
        private static unsafe int CalculateDistance(string sourceString, int sourceLength, string targetString, int targetLength, int startIndex)
        {
            var arrayPool   = ArrayPool<int>.Shared;
            var pooledArray = arrayPool.Rent(targetLength);

            try
            {
                // ArrayPool may hand back a larger array than requested; trim the span to exactly what we use.
                Span<int>          previousRow = pooledArray.AsSpan(0, targetLength);
                ReadOnlySpan<char> source      = sourceString.AsSpan().Slice(startIndex, sourceLength);
                ReadOnlySpan<char> target      = targetString.AsSpan().Slice(startIndex, targetLength);

                fixed (char* targetPtr = target)
                fixed (char* srcPtr = source)
                fixed (int* previousRowPtr = previousRow)
                {
                    FillRow(previousRowPtr, targetLength);

                    var rowIndex = 0;

                    // The vector path unconditionally touches columns 0..3 and stores to indices starting at
                    // targetLength - 3, so it is only entered when the target has at least 4 columns.
                    for (; targetLength >= 4 && rowIndex < sourceLength - 3; rowIndex += 4)
                    {
                        var diag = Vector128.Create(rowIndex);
                        var left = Vector128.Create(rowIndex + 1);

                        // Source characters are sign-extended to 32 bits; target characters are inserted through
                        // a (short) cast below, so both sides use the same encoding.
                        var sourceV = Sse42.ConvertToVector128Int32((short*)(srcPtr + rowIndex));
                        var targetV = Vector128<int>.Zero;
                        var one     = Vector128.Create(1);

                        // Prime the vector: lane k first sees column 0 on iteration k, so 4 iterations are
                        // needed before lane 3 holds the first finished value.
                        var shift = Vector128.CreateScalar(-1);
                        for (int columnIndex = 0; columnIndex < 4; columnIndex++)
                        {
                            // Shift in the next character
                            targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 4);
                            targetV = Sse42.Insert(targetV, (short)targetPtr[columnIndex], 0);

                            // Insert "(rowIndex + columnIndex + 1)" from the left
                            var leftValue = Vector128.Create(rowIndex + columnIndex + 1);
                            left  = Sse42.Or(Sse42.And(shift, leftValue), left);
                            shift = Sse42.ShiftLeftLogical128BitLane(shift, 4);

                            // Compare source to target as integers. A float comparison of the raw bits treats
                            // sign-extended characters >= U+8000 as NaN and would never report them as equal.
                            var match = Sse2.CompareEqual(sourceV, targetV);
                            var next  = Sse2.Add(diag, Sse2.AndNot(match, one));

                            // Create next diag which is current up
                            var up = Sse42.ShiftLeftLogical128BitLane(left, 4);
                            up = Sse42.Insert(up, previousRowPtr[columnIndex], 0);

                            var tmp = Sse42.Add(Sse42.Min(left, up), one);
                            next = Sse42.Min(next, tmp);

                            left = next;
                            diag = up;
                        }

                        // Lane 3 now holds the finished value for column 0.
                        previousRowPtr[0] = Sse42.Extract(left, 3);

                        // Steady state: each new column completes the value of the column 3 positions back.
                        for (int columnIndex = 4; columnIndex < targetLength; columnIndex++)
                        {
                            // Shift in the next character
                            targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 4);
                            targetV = Sse42.Insert(targetV, (short)targetPtr[columnIndex], 0);

                            // Integer comparison: add 1 to the diagonal only where the characters differ
                            var match = Sse2.CompareEqual(sourceV, targetV);
                            var next  = Sse2.Add(diag, Sse2.AndNot(match, one));

                            // Create next diag which is current up
                            var up = Sse42.ShiftLeftLogical128BitLane(left, 4);
                            up = Sse42.Insert(up, previousRowPtr[columnIndex], 0);

                            var tmp = Sse42.Add(Sse42.Min(left, up), one);
                            next = Sse42.Min(next, tmp);

                            left = next;
                            diag = up;

                            // Store one value
                            previousRowPtr[columnIndex - 3] = Sse42.Extract(next, 3);
                        }

                        // Drain the last 3 columns: no more characters are read, values are only shifted out.
                        for (int i = targetLength - 3; i < targetLength; i++)
                        {
                            // Keep shifting the target window
                            targetV = Sse42.ShiftLeftLogical128BitLane(targetV, 4);

                            // Integer comparison: add 1 to the diagonal only where the characters differ
                            var match = Sse2.CompareEqual(sourceV, targetV);
                            var next  = Sse2.Add(diag, Sse2.AndNot(match, one));

                            // Create next diag which is current up
                            var up = Sse42.ShiftLeftLogical128BitLane(left, 4);

                            var tmp = Sse42.Add(Sse42.Min(left, up), one);
                            next = Sse42.Min(next, tmp);

                            left = next;
                            diag = up;

                            // Store one value
                            previousRowPtr[i] = Sse42.Extract(next, 3);
                        }

#if DEBUG
                        Console.Write("prev values for row {0}:", rowIndex);
                        for (int i = 0; i < targetLength; ++i)
                        {
                            Console.Write("{0} ", previousRow[i]);
                        }
                        Console.WriteLine();
#endif
                    }

                    // Calculate the remaining rows one at a time
                    for (; rowIndex < sourceLength; rowIndex++)
                    {
                        var lastSubstitutionCost = rowIndex;
                        var lastInsertionCost    = rowIndex + 1;
                        var sourcePrevChar       = source[rowIndex];
#if DEBUG
                        Console.Write("prev values for row {0}:", rowIndex);
                        for (int i = 0; i < targetLength; ++i)
                        {
                            Console.Write("{0} ", previousRow[i]);
                        }
                        Console.WriteLine();
#endif

                        CalculateRow(previousRowPtr, targetPtr, targetLength, sourcePrevChar, lastInsertionCost, lastSubstitutionCost);
                    }
                }

                return previousRow[targetLength - 1];
            }
            finally
            {
                // Hand the buffer back even when slicing or the row computation throws.
                arrayPool.Return(pooledArray);
            }
        }
Beispiel #23
0
 private static int Min(int a, int b)
 {
     // Signed 32-bit minimum via the SSE4.1 vector min; only element 0 is used.
     Vector128<int> left  = Vector128.CreateScalar(a);
     Vector128<int> right = Vector128.CreateScalar(b);

     return Sse41.Min(left, right).ToScalar();
 }
Beispiel #24
0
    public void ResizeBicubic(FastBitmap rtnImage)
    {
        float scaleX = (float)this.width / rtnImage.width;
        float scaleY = (float)this.height / rtnImage.height;

        if (scaleX > 1 || scaleY > 1)
        {
            throw new Exception("拡大のみ対応");
        }

        float[] tmpa = new float[rtnImage.width * 4 * this.height];
        fixed(float *tmpp = tmpa)
        {
            float *tmp     = tmpp;
            var    _00mask = Vector128.Create(0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255);
            var    _01mask = Vector128.Create(4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255);
            var    _10mask = Vector128.Create(8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255);
            var    _11mask = Vector128.Create(12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255);
            var    _vmask  = Vector128.Create(0, 4, 8, 12, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255);

            var _1012  = Vector128.Create(-1, 0, 1, 2);
            var _0123i = Vector128.Create(0, 1, 2, 3);

            var _0000   = Vector128.Create(0, 0, 0, 0);
            var _0000f  = Vector128.Create(0f, 0, 0, 0);
            var _255f   = Vector128.Create(255f, 255, 255, 255);
            var _1111   = Vector128.Create(1, 1, 1, 1);
            var _1111f  = Vector128.Create(1f, 1, 1, 1);
            var _4444f  = Vector128.Create(4f, 4, 4, 4);
            var _4444   = Vector128.Create(4, 4, 4, 4);
            var _5555f  = Vector128.Create(5f, 5, 5, 5);
            var _2222f  = Vector128.Create(2f, 2, 2, 2);
            var _8888f  = Vector128.Create(8f, 8, 8, 8);
            var _7f     = Vector128.Create(0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff).AsSingle();
            var _ff     = Vector128.Create(-1, -1, -1, -1);
            var _stride = Vector128.Create(rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4, rtnImage.width * 4);

            Parallel.For(0, this.height, (y) =>
            {
                float py      = (y * scaleY);
                float *tmpPos = tmp + y * rtnImage.width * 4;
                for (int x = 0; x < rtnImage.width; x++)
                {
                    float px = (x * scaleX);
                    int sx   = (int)px;

                    var _px = Vector128.CreateScalar(px);
                    _px     = Sse.Shuffle(_px, _px, 0);

                    var _sx = Vector128.CreateScalar(sx);
                    _sx     = Sse2.Shuffle(_sx, 0);

                    var _width = Vector128.CreateScalar(this.width);
                    _width     = Sse2.Shuffle(_width, 0);

                    var _x2 = Sse2.Add(_sx, _1012);

                    var _d  = Sse.And(Sse.Subtract(_px, Sse2.ConvertToVector128Single(_x2)), _7f);
                    var _d2 = Sse.Multiply(_d, _d);
                    var _d3 = Sse.Multiply(_d2, _d);

                    var w1   = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
                    var w2   = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
                    var wb   = Sse2.CompareGreaterThan(_d, _1111f);
                    var _w   = Sse41.BlendVariable(w1, w2, wb);
                    var _xpb = Sse2.Or(Sse2.CompareLessThan(_x2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_x2, _width), _1111).AsInt32(), _ff));
                    var _xpp = Sse2.And(_sx, _xpb);
                    var _xp  = Sse41.BlendVariable(_x2, _xpp, _xpb);

                    var p = Avx2.GatherVector128((uint *)(this._ptr + this._stride * y), _xp, 4).AsByte();


                    var _p0 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _00mask).AsInt32());
                    var _p1 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _01mask).AsInt32());
                    var _p2 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _10mask).AsInt32());
                    var _p3 = Sse2.ConvertToVector128Single(Ssse3.Shuffle(p, _11mask).AsInt32());

                    var _w0 = Sse.Shuffle(_w, _w, 0);
                    var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                    var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                    var _w3 = Sse.Shuffle(_w, _w, 0b11111111);

                    var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));

                    Sse2.Store(tmpPos + x * 4, rgbaf);
                }
            });

            Parallel.For(0, rtnImage.height, (y) =>
            {
                float py = (y * scaleY);
                int sy   = (int)py;

                uint *store = stackalloc uint[4];

                var _py = Vector128.CreateScalar(py);
                _py     = Sse.Shuffle(_py, _py, 0);

                var _sy = Vector128.CreateScalar(sy);
                _sy     = Sse2.Shuffle(_sy, 0);

                var _height = Vector128.CreateScalar(this.height);
                _height     = Sse2.Shuffle(_height, 0);

                var _y2 = Sse2.Add(_sy, _1012);

                var _d  = Sse.And(Sse.Subtract(_py, Sse2.ConvertToVector128Single(_y2)), _7f);
                var _d2 = Sse.Multiply(_d, _d);
                var _d3 = Sse.Multiply(_d2, _d);

                var w1 = Sse.Add(_1111f, Sse.Subtract(_d3, Sse.Multiply(_2222f, _d2)));
                var w2 = Sse.Subtract(Sse.Subtract(Sse.Add(_4444f, Sse.Multiply(_5555f, _d2)), Sse.Multiply(_d, _8888f)), _d3);
                var wb = Sse2.CompareGreaterThan(_d, _1111f);
                var _w = Sse41.BlendVariable(w1, w2, wb);


                var _ypb = Sse2.Or(Sse2.CompareLessThan(_y2, _0000), Sse41.MultiplyLow(Sse2.AndNot(Sse2.CompareLessThan(_y2, _height), _1111).AsInt32(), _ff));
                var _ypp = Sse2.And(_sy, _ypb);
                var _yp  = Sse41.BlendVariable(_y2, _ypp, _ypb);
                var _yps = Sse41.MultiplyLow(_yp, _stride);

                var _yp0  = Sse2.Add(Sse2.Shuffle(_yps, 0), _0123i);
                var _yp1  = Sse2.Add(Sse2.Shuffle(_yps, 0b01010101), _0123i);
                var _yp2  = Sse2.Add(Sse2.Shuffle(_yps, 0b10101010), _0123i);
                var _yp3  = Sse2.Add(Sse2.Shuffle(_yps, 0b11111111), _0123i);
                uint *rtn = (uint *)(rtnImage._ptr + rtnImage._stride * y);

                for (int x = 0; x < rtnImage.width; x++)
                {
                    var _p0 = Avx2.GatherVector128((float *)(tmp), _yp0, 4);
                    var _p1 = Avx2.GatherVector128((float *)(tmp), _yp1, 4);
                    var _p2 = Avx2.GatherVector128((float *)(tmp), _yp2, 4);
                    var _p3 = Avx2.GatherVector128((float *)(tmp), _yp3, 4);

                    var _w0 = Sse.Shuffle(_w, _w, 0);
                    var _w1 = Sse.Shuffle(_w, _w, 0b01010101);
                    var _w2 = Sse.Shuffle(_w, _w, 0b10101010);
                    var _w3 = Sse.Shuffle(_w, _w, 0b11111111);

                    var rgbaf = Sse.Add(Sse.Add(Sse.Multiply(_p0, _w0), Sse.Multiply(_p1, _w1)), Sse.Add(Sse.Multiply(_p2, _w2), Sse.Multiply(_p3, _w3)));

                    var _b0 = Sse.CompareLessThan(rgbaf, _0000f);
                    rgbaf   = Sse41.BlendVariable(rgbaf, _0000f, _b0);
                    var _b1 = Sse.CompareGreaterThan(rgbaf, _255f);
                    rgbaf   = Sse41.BlendVariable(rgbaf, _255f, _b1);

                    var rgbab = Sse2.ConvertToVector128Int32(rgbaf).AsByte();
                    var rgba  = Ssse3.Shuffle(rgbab, _vmask).AsUInt32();

                    Sse2.Store(store, rgba);

                    _yp0 = Sse2.Add(_yp0, _4444);
                    _yp1 = Sse2.Add(_yp1, _4444);
                    _yp2 = Sse2.Add(_yp2, _4444);
                    _yp3 = Sse2.Add(_yp3, _4444);
                    *rtn = *store;
                    rtn++;
                }
            });
        /// <summary>
        /// Walk-through of vector creation and of a selection of SSE, AVX and FMA intrinsics.
        /// Every result is kept in a local only, so nothing is observable from outside; the
        /// trailing comments state the value each call yields for the operands that are live
        /// at that point (values marked with ≈ are rounded).
        /// </summary>
        public Intro()
        {
            var middleVector = Vector128.Create(1.0f);                      // middleVector = <1,1,1,1>

            middleVector = Vector128.CreateScalar(-1.0f);                   // middleVector = <-1,0,0,0>
            var floatBytes = Vector64.AsByte(Vector64.Create(1.0f, -1.0f)); // floatBytes = <0, 0, 128, 63, 0, 0, 128, 191>

            if (Avx.IsSupported)
            {
                var left  = Vector256.Create(-2.5f);                     // <-2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5, -2.5>
                var right = Vector256.Create(5.0f);                      // <5, 5, 5, 5, 5, 5, 5, 5>
                Vector256 <float> result = Avx.AddSubtract(left, right); // even lanes left - right, odd lanes left + right: <-7.5, 2.5, -7.5, 2.5, -7.5, 2.5, -7.5, 2.5>

                left   = Vector256.Create(-1.0f, -2.0f, -3.0f, -4.0f, -50.0f, -60.0f, -70.0f, -80.0f);
                right  = Vector256.Create(0.0f, 2.0f, 3.0f, 4.0f, 50.0f, 60.0f, 70.0f, 80.0f);
                result = Avx.UnpackHigh(left, right);              // result = <-3, 3, -4, 4, -70, 70, -80, 80>
                result = Avx.UnpackLow(left, right);               // result = <-1, 0, -2, 2, -50, 50, -60, 60>
                result = Avx.DotProduct(left, right, 0b1111_0001); // per 128-bit lane, sum of all four products into element 0: <-29, 0, 0, 0, -17400, 0, 0, 0>

                // TestC(a, b) is true when no element has its sign bit set in (~a & b).
                bool testResult = Avx.TestC(left, right);          // testResult = true  (every element of left is negative)
                testResult = Avx.TestC(right, left);               // testResult = false

                Vector256 <float> result1 = Avx.Divide(left, right); // result1 = <-Infinity, -1, -1, -1, -1, -1, -1, -1>
                var plusOne = Vector256.Create(1.0f);

                // A lane for which the predicate holds is filled with one-bits, which reads back as NaN.
                result = Avx.Compare(right, result1, FloatComparisonMode.OrderedGreaterThanNonSignaling);   // every lane true
                result = Avx.Compare(right, result1, FloatComparisonMode.UnorderedNotLessThanNonSignaling); // every lane true

                left   = Vector256.Create(0.0f, 3.0f, -3.0f, 4.0f, -50.0f, 60.0f, -70.0f, 80.0f);
                right  = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                Vector256 <float> nanInFirstPosition = Avx.Divide(left, right); // 0 / 0 -> <NaN, 1.5, -1, 2, -1, -1, -1, -1>
                left = Vector256.Create(1.1f, 3.3333333f, -3.0f, 4.22f, -50.0f, 60.0f, -70.0f, 80.0f);
                Vector256 <float> InfInFirstPosition = Avx.Divide(left, right); // 1.1 / 0 -> <+Infinity, ≈1.667, -1, 2.11, -1, -1, -1, -1>

                left  = Vector256.Create(-1.1f, 3.0f, 1.0f / 3.0f, MathF.PI, -50.0f, 60.0f, -70.0f, 80.0f);
                right = Vector256.Create(0.0f, 2.0f, 3.1f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                Vector256 <float> compareResult = Avx.Compare(left, right, FloatComparisonMode.OrderedGreaterThanNonSignaling); // compareResult = <0, NaN, 0, NaN, 0, NaN, 0, NaN>
                Vector256 <float> mixed         = Avx.BlendVariable(left, right, compareResult);                                // takes right where the mask sign bit is set: <-1.1, 2, ≈0.333, 2, -50, -60, -70, -80>

                right = Vector256.Create(0.0f, 2.0f, 3.0f, 2.0f, 50.0f, -60.0f, 70.0f, -80.0f);
                Vector256 <float> other = right;

                // TestZ(a, b) is true when no element has its sign bit set in (a & b).
                bool bRes    = Avx.TestZ(plusOne, compareResult);        // true:  plusOne has no sign bits set
                bool bRes2   = Avx.TestC(plusOne, compareResult);        // false: ~plusOne & compareResult keeps the sign bits of the odd lanes
                bool anyTrue = !Avx.TestZ(compareResult, compareResult); // true:  at least one lane of the mask is set (an "any" test, not "all")

                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.OrderedEqualNonSignaling);                // compareResult = <0, 0, 0, NaN, 0, 0, 0, 0>
                compareResult = Avx.Compare(nanInFirstPosition, right, FloatComparisonMode.UnorderedEqualNonSignaling);              // compareResult = <NaN, 0, 0, NaN, 0, 0, 0, 0> (a NaN operand counts as true)
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.UnorderedNotLessThanOrEqualNonSignaling); // compareResult = <NaN, 0, 0, NaN, 0, NaN, 0, NaN>
                compareResult = Avx.Compare(InfInFirstPosition, right, FloatComparisonMode.OrderedGreaterThanNonSignaling);          // compareResult = <NaN, 0, 0, NaN, 0, NaN, 0, NaN>

                var left128  = Vector128.Create(1.0f, 2.0f, 3.0f, 4.0f);
                var right128 = Vector128.Create(2.0f, 3.0f, 4.0f, 5.0f);
                Vector128 <float> compResult128 = Sse.CompareGreaterThan(left128, right128); // compResult128 = <0, 0, 0, 0>

                int res = Avx.MoveMask(compareResult); // one bit per lane sign bit: res = 0b1010_1001 (169)

                if (Fma.IsSupported)
                {
                    Vector256 <float> resultFma = Fma.MultiplyAdd(left, right, other); // = left * right + other for each element
                    resultFma = Fma.MultiplyAddNegated(left, right, other);            // = -(left * right) + other for each element
                    resultFma = Fma.MultiplySubtract(left, right, other);              // = left * right - other for each element
                    resultFma = Fma.MultiplyAddSubtract(left, right, other);           // even elements (0, 2, ...) subtract other, odd elements add other
                }

                // From here on: left  = <-1.1, 3, 1/3, PI, -50, 60, -70, 80>
                //               right = <0, 2, 3, 2, 50, -60, 70, -80>
                result = Avx.DotProduct(left, right, 0b1010_0001); // products of elements 1 and 3 of each lane: <≈12.283, 0, 0, 0, -10000, 0, 0, 0>
                result = Avx.Floor(left);                          // result = <-2, 3, 0, 3, -50, 60, -70, 80>
                result = Avx.Add(left, right);                     // result = <-1.1, 5, ≈3.333, ≈5.142, 0, 0, 0, 0>
                result = Avx.Ceiling(left);                        // result = <-1, 3, 1, 4, -50, 60, -70, 80>
                result = Avx.Multiply(left, right);                // result = <-0, 6, ≈1, ≈6.283, -2500, -3600, -4900, -6400>
                result = Avx.HorizontalAdd(left, right);           // result = <1.9, ≈3.475, 2, 5, 10, 10, -10, -10>
                result = Avx.HorizontalSubtract(left, right);      // result = <-4.1, ≈-2.808, -2, 1, -110, -150, 110, 150>

                double[] someDoubles      = new double[] { 1.0, 3.0, -2.5, 7.5, 10.8, 0.33333 };
                double[] someOtherDoubles = new double[] { 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 };
                double[] someResult       = new double[someDoubles.Length];
                float[]  someFloats       = new float[] { 1, 2, 3, 4, 10, 20, 30, 40, 0 };
                float[]  someOtherFloats  = new float[] { 1, 1, 1, 1, 1, 1, 1, 1 };
                unsafe
                {
                    fixed(double *ptr = &someDoubles[1])
                    {
                        fixed(double *ptr2 = &someResult[0])
                        {
                            // Reads four doubles starting at index 1 and writes them to someResult[0..3].
                            Vector256 <double> res2 = Avx.LoadVector256(ptr); // res2 = <3, -2.5, 7.5, 10.8>

                            Avx.Store(ptr2, res2);
                        }
                    }

                    fixed(float *ptr = &someFloats[0])
                    {
                        fixed(float *ptr2 = &someOtherFloats[0])
                        {
                            // Only element 0 of each 128-bit lane takes part: res2 = <1, 0, 0, 0, 10, 0, 0, 0>
                            Vector256 <float> res2 = Avx.DotProduct(Avx.LoadVector256(ptr), Avx.LoadVector256(ptr2), 0b0001_0001);
                        }
                    }
                }
            }
        }
Beispiel #26
0
        /// <summary>
        /// Folds <paramref name="buffer"/> into the running checksum <paramref name="adler"/>.
        /// Whole blocks of <c>BlockSize</c> bytes are consumed with SSE2/SSE3/SSSE3 instructions;
        /// any remaining bytes are handed to <c>HandleLeftOver</c>.
        /// </summary>
        /// <param name="adler">Current checksum: the low 16 bits hold the first sum, the high 16 bits the second.</param>
        /// <param name="buffer">Bytes to add to the checksum.</param>
        /// <returns>The updated checksum, packed as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
        private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer)
        {
            // Shuffle controls for the horizontal reductions.
            const byte SwapPairs  = 0b1011_0001; // A B C D -> B A D C
            const byte SwapHalves = 0b0100_1110; // A B C D -> C D A B

            uint lowSum  = adler & 0xFFFF;
            uint highSum = (adler >> 16) & 0xFFFF;

            // Split the input into whole blocks plus a tail shorter than one block.
            uint tailLength = (uint)buffer.Length;
            uint blocksLeft = tailLength / BlockSize;
            tailLength -= blocksLeft * BlockSize;

            fixed (byte* bufferStart = &MemoryMarshal.GetReference(buffer))
            fixed (byte* tapStart = &MemoryMarshal.GetReference(Tap1Tap2))
            {
                byte* cursor = bufferStart;

                // Per-byte weights for the first and second 16 bytes of a block
                // (the managed stand-in for _mm_setr_epi8).
                Vector128<sbyte> weightsFirst  = Sse2.LoadVector128((sbyte*)tapStart);
                Vector128<sbyte> weightsSecond = Sse2.LoadVector128((sbyte*)(tapStart + 0x10));
                Vector128<byte>  allZero       = Vector128<byte>.Zero;
                Vector128<short> allOnes       = Vector128.Create((short)1);

                while (blocksLeft > 0)
                {
                    // The NMAX constraint: at most NMAX bytes may be accumulated
                    // before the second sum has to be reduced modulo BASE.
                    uint run = NMAX / BlockSize;
                    if (run > blocksLeft)
                    {
                        run = blocksLeft;
                    }

                    blocksLeft -= run;

                    Vector128<uint> prefix   = Vector128.CreateScalar(lowSum * run);
                    Vector128<uint> weighted = Vector128.CreateScalar(highSum);
                    Vector128<uint> byteSums = Vector128<uint>.Zero;

                    do
                    {
                        // One block = two unaligned 16-byte loads.
                        Vector128<byte> firstHalf  = Sse3.LoadDquVector128(cursor);
                        Vector128<byte> secondHalf = Sse3.LoadDquVector128(cursor + 0x10);

                        // The byte sum of all previous blocks goes into the prefix
                        // before this block's bytes are added to it.
                        prefix = Sse2.Add(prefix, byteSums);

                        // First sum: horizontal byte sums of both halves.
                        Vector128<uint> firstBytes  = Sse2.SumAbsoluteDifferences(firstHalf, allZero).AsUInt32();
                        Vector128<uint> secondBytes = Sse2.SumAbsoluteDifferences(secondHalf, allZero).AsUInt32();
                        byteSums = Sse2.Add(Sse2.Add(byteSums, firstBytes), secondBytes);

                        // Second sum: bytes multiplied by their weights, widened to 32 bits.
                        Vector128<short> firstProducts  = Ssse3.MultiplyAddAdjacent(firstHalf, weightsFirst);
                        Vector128<short> secondProducts = Ssse3.MultiplyAddAdjacent(secondHalf, weightsSecond);
                        weighted = Sse2.Add(weighted, Sse2.MultiplyAddAdjacent(firstProducts, allOnes).AsUInt32());
                        weighted = Sse2.Add(weighted, Sse2.MultiplyAddAdjacent(secondProducts, allOnes).AsUInt32());

                        cursor += BlockSize;
                        run--;
                    }
                    while (run != 0);

                    weighted = Sse2.Add(weighted, Sse2.ShiftLeftLogical(prefix, 5));

                    // Collapse the vector accumulators into the scalar sums.
                    byteSums = Sse2.Add(byteSums, Sse2.Shuffle(byteSums, SwapHalves));
                    lowSum  += byteSums.ToScalar();

                    weighted = Sse2.Add(weighted, Sse2.Shuffle(weighted, SwapPairs));
                    weighted = Sse2.Add(weighted, Sse2.Shuffle(weighted, SwapHalves));
                    highSum  = weighted.ToScalar();

                    // Reduce.
                    lowSum  %= BASE;
                    highSum %= BASE;
                }

                if (tailLength > 0)
                {
                    HandleLeftOver(cursor, tailLength, ref lowSum, ref highSum);
                }
            }

            return lowSum | (highSum << 16);
        }
Beispiel #27
0
        /// <summary>
        /// Folds <paramref name="buffer"/> into the running checksum <paramref name="adler"/>.
        /// Whole 32-byte blocks are consumed with SSE2/SSE3/SSSE3 instructions; the remaining
        /// (at most 31) bytes are added one at a time.
        /// </summary>
        /// <param name="adler">Current checksum: the low 16 bits hold s1, the high 16 bits hold s2.</param>
        /// <param name="buffer">Bytes to add to the checksum.</param>
        /// <returns>The updated checksum, packed as <c>s1 | (s2 &lt;&lt; 16)</c>.</returns>
        private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer)
        {
            uint s1 = adler & 0xFFFF;
            uint s2 = (adler >> 16) & 0xFFFF;

            // Process the data in blocks.
            const int BLOCK_SIZE = 1 << 5;

            uint length = (uint)buffer.Length;
            uint blocks = length / BLOCK_SIZE;

            // From here on, length is the size of the tail that does not fill a block.
            length -= blocks * BLOCK_SIZE;

            fixed(byte *bufferPtr = buffer)
            fixed(byte *tapPtr = Tap1Tap2)
            {
                var localBufferPtr = bufferPtr;

                // _mm_setr_epi8 on x86
                // NOTE(review): Tap1Tap2 is read as 2 x 16 weights, so it must hold at least 32 bytes - confirm at its declaration.
                Vector128 <sbyte> tap1 = Sse2.LoadVector128((sbyte *)tapPtr);
                Vector128 <sbyte> tap2 = Sse2.LoadVector128((sbyte *)(tapPtr + 0x10));
                Vector128 <byte>  zero = Vector128 <byte> .Zero;
                var ones = Vector128.Create((short)1);

                while (blocks > 0)
                {
                    uint n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    Vector128 <uint> v_ps = Vector128.CreateScalar(s1 * n);
                    Vector128 <uint> v_s2 = Vector128.CreateScalar(s2);
                    Vector128 <uint> v_s1 = Vector128 <uint> .Zero;

                    do
                    {
                        // Load 32 input bytes.
                        Vector128 <byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                        Vector128 <byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 0x10);

                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);

                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsUInt32());
                        Vector128 <short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones).AsUInt32());

                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsUInt32());
                        Vector128 <short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones).AsUInt32());

                        localBufferPtr += BLOCK_SIZE;
                    }while (--n > 0);

                    // Each prefix unit stands for one whole block, hence the shift by log2(BLOCK_SIZE).
                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                    const byte S2301 = 0b1011_0001;  // A B C D -> B A D C
                    const byte S1032 = 0b0100_1110;  // A B C D -> C D A B

                    // SumAbsoluteDifferences only fills 32-bit elements 0 and 2,
                    // so one half-swap is enough to total v_s1.
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                    s1 += v_s1.ToScalar();

                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                    s2 = v_s2.ToScalar();

                    // Reduce.
                    s1 %= BASE;
                    s2 %= BASE;
                }

                // Scalar tail: fewer than BLOCK_SIZE bytes are left.
                if (length > 0)
                {
                    if (length >= 16)
                    {
                        // Unrolled handling of the first 16 tail bytes.
                        s2 += s1 += localBufferPtr[0];
                        s2 += s1 += localBufferPtr[1];
                        s2 += s1 += localBufferPtr[2];
                        s2 += s1 += localBufferPtr[3];
                        s2 += s1 += localBufferPtr[4];
                        s2 += s1 += localBufferPtr[5];
                        s2 += s1 += localBufferPtr[6];
                        s2 += s1 += localBufferPtr[7];
                        s2 += s1 += localBufferPtr[8];
                        s2 += s1 += localBufferPtr[9];
                        s2 += s1 += localBufferPtr[10];
                        s2 += s1 += localBufferPtr[11];
                        s2 += s1 += localBufferPtr[12];
                        s2 += s1 += localBufferPtr[13];
                        s2 += s1 += localBufferPtr[14];
                        s2 += s1 += localBufferPtr[15];

                        localBufferPtr += 16;
                        length         -= 16;
                    }

                    while (length-- > 0)
                    {
                        s2 += s1 += *localBufferPtr++;
                    }

                    // The tail adds at most 31 * 255 to s1, so a single
                    // subtraction replaces the modulo here.
                    if (s1 >= BASE)
                    {
                        s1 -= BASE;
                    }

                    s2 %= BASE;
                }

                return(s1 | (s2 << 16));
            }
        }
Beispiel #28
0
        // Based on https://github.com/chromium/chromium/blob/master/third_party/zlib/adler32_simd.c
#if !NETSTANDARD2_0 && !NETSTANDARD2_1
        private static unsafe uint CalculateSse(uint adler, ReadOnlySpan <byte> buffer)
        {
            uint s1 = adler & 0xFFFF;
            uint s2 = (adler >> 16) & 0xFFFF;

            // Process the data in blocks.
            const int BLOCK_SIZE = 1 << 5;

            uint length = (uint)buffer.Length;
            uint blocks = length / BLOCK_SIZE;

            length -= blocks * BLOCK_SIZE;

            int index = 0;

            fixed(byte *bufferPtr = &buffer[0])
            {
                index += (int)blocks * BLOCK_SIZE;
                var localBufferPtr = bufferPtr;

                // _mm_setr_epi8 on x86
                var tap1 = Vector128.Create(32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17);
                var tap2 = Vector128.Create(16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1);
                Vector128 <byte> zero = Vector128 <byte> .Zero;
                var ones = Vector128.Create((short)1);

                while (blocks > 0)
                {
                    uint n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
                    if (n > blocks)
                    {
                        n = blocks;
                    }

                    blocks -= n;

                    // Process n blocks of data. At most NMAX data bytes can be
                    // processed before s2 must be reduced modulo BASE.
                    Vector128 <int> v_ps = Vector128.CreateScalar(s1 * n).AsInt32();
                    Vector128 <int> v_s2 = Vector128.CreateScalar(s2).AsInt32();
                    Vector128 <int> v_s1 = Vector128 <int> .Zero;

                    do
                    {
                        // Load 32 input bytes.
                        Vector128 <byte> bytes1 = Sse3.LoadDquVector128(localBufferPtr);
                        Vector128 <byte> bytes2 = Sse3.LoadDquVector128(localBufferPtr + 16);

                        // Add previous block byte sum to v_ps.
                        v_ps = Sse2.Add(v_ps, v_s1);

                        // Horizontally add the bytes for s1, multiply-adds the
                        // bytes by [ 32, 31, 30, ... ] for s2.
                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes1, zero).AsInt32());
                        Vector128 <short> mad1 = Ssse3.MultiplyAddAdjacent(bytes1, tap1);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad1, ones));

                        v_s1 = Sse2.Add(v_s1, Sse2.SumAbsoluteDifferences(bytes2, zero).AsInt32());
                        Vector128 <short> mad2 = Ssse3.MultiplyAddAdjacent(bytes2, tap2);
                        v_s2 = Sse2.Add(v_s2, Sse2.MultiplyAddAdjacent(mad2, ones));

                        localBufferPtr += BLOCK_SIZE;
                    }while (--n > 0);

                    v_s2 = Sse2.Add(v_s2, Sse2.ShiftLeftLogical(v_ps, 5));

                    // Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
                    const byte S2301 = 0b1011_0001;  // A B C D -> B A D C
                    const byte S1032 = 0b0100_1110;  // A B C D -> C D A B

                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S2301));
                    v_s1 = Sse2.Add(v_s1, Sse2.Shuffle(v_s1, S1032));

                    s1 += (uint)v_s1.ToScalar();

                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S2301));
                    v_s2 = Sse2.Add(v_s2, Sse2.Shuffle(v_s2, S1032));

                    s2 = (uint)v_s2.ToScalar();

                    // Reduce.
                    s1 %= BASE;
                    s2 %= BASE;
                }
            }

            ref byte bufferRef = ref MemoryMarshal.GetReference(buffer);