예제 #1
0
        /// <summary>
        /// Computes the four-element dot product of <paramref name="left"/> and <paramref name="right"/>,
        /// using the widest instruction set the hardware reports support for.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>
        /// On the intrinsic paths, a vector whose four elements all hold the dot product.
        /// Without SSE the result comes from <c>DotProduct4D_Software</c>.
        /// </returns>
        public static Vector128<float> DotProduct4D(Vector128<float> left, Vector128<float> right)
        {
            if (Sse41.IsSupported)
            {
                // High nibble: multiply all four lanes. Low nibble: write the sum to all four lanes.
                const byte allLanes = 0b_1111_1111;
                return Sse41.DotProduct(left, right, allLanes);
            }

            if (Sse3.IsSupported)
            {
                Vector128<float> products = Sse.Multiply(left, right);

                // Two rounds of horizontal add leave the total of all four products in every lane.
                Vector128<float> pairSums = Sse3.HorizontalAdd(products, products);
                return Sse3.HorizontalAdd(pairSums, pairSums);
            }

            if (Sse.IsSupported)
            {
                Vector128<float> products = Sse.Multiply(left, right);

                // NOTE(review): assumes ShuffleValues names list the selected lanes in element order
                // (first two from operand one, last two from operand two) - confirm against its definition.
                Vector128<float> scratch = Sse.Shuffle(right, products, ShuffleValues.XXXY);
                scratch = Sse.Add(scratch, products);

                products = Sse.Shuffle(products, scratch, ShuffleValues.XXWX);
                products = Sse.Add(products, scratch);

                // The complete sum is taken from the third lane and replicated.
                return Sse.Shuffle(products, products, ShuffleValues.ZZZZ);
            }

            return DotProduct4D_Software(left, right);
        }
예제 #2
0
        /// <summary>
        /// Adds the two instance fields with <c>Sse.Add</c>, writes the sum to the data table's
        /// output buffer, and validates it against the same fields.
        /// </summary>
        public void RunFldScenario()
        {
            var sum = Sse.Add(_fld1, _fld2);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
        }
예제 #3
0
        /// <summary>
        /// Repeatedly accumulates a fixed vector into a running vector with <c>Sse.Add</c>
        /// until cancellation is requested.
        /// </summary>
        /// <param name="cancellationToken">Token polled once per outer iteration.</param>
        /// <returns>
        /// The number of completed outer iterations (each performs <c>LENGTH</c> additions),
        /// or 0 when SSE is not supported.
        /// </returns>
        public override ulong Run(CancellationToken cancellationToken)
        {
            if (!Sse.IsSupported)
            {
                return 0uL;
            }

            var addendValues = new Span<float>(new[] { RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT });
            var totals       = new Span<float>(new float[4]);
            var completed    = 0uL;

            unsafe
            {
                fixed (float* totalsPtr = totals)
                fixed (float* addendPtr = addendValues)
                {
                    var addend  = Sse.LoadVector128(addendPtr);
                    var running = Sse.LoadVector128(totalsPtr);

                    while (!cancellationToken.IsCancellationRequested)
                    {
                        for (var step = 0; step < LENGTH; step++)
                        {
                            running = Sse.Add(running, addend);
                        }

                        // Write the running total back once per outer iteration.
                        Sse.Store(totalsPtr, running);

                        completed++;
                    }
                }
            }

            return completed;
        }
예제 #4
0
        /// <summary>
        /// Computes the four-element dot product of <paramref name="left"/> and <paramref name="right"/>.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>
        /// On the intrinsic paths, a vector whose four elements all hold the dot product.
        /// Without SSE the result comes from <c>DotProduct4D_Software</c>.
        /// </returns>
        public static VectorF DotProduct4D(VectorFParam1_3 left, VectorFParam1_3 right)
        {
            if (Sse41.IsSupported)
            {
                // This multiplies the first 4 elems of each and broadcasts it into each element of the returning vector
                const byte control = 0b_1111_1111;
                return(Sse41.DotProduct(left, right, control));
            }
            else if (Sse3.IsSupported)
            {
                VectorF mul = Sse.Multiply(left, right);

                // Double horizontal add leaves the sum of all four products in every element
                mul = Sse3.HorizontalAdd(mul, mul);
                return(Sse3.HorizontalAdd(mul, mul));
            }
            else if (Sse.IsSupported)
            {
                // NOTE(review): the lane comments below assume Shuffle(a, b, c, d) encodes its
                // selectors like _MM_SHUFFLE - confirm against the helper's definition.
                VectorF copy = right;
                VectorF mul  = Sse.Multiply(left, copy);

                // Move mul.X and mul.Y into the upper two lanes of copy
                copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));
                // Upper lanes become X+Z and Y+W
                copy = Sse.Add(copy, mul);
                // Bring Y+W into the third lane of mul
                mul  = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));
                // Full-width add: the total is assembled in the third lane, which a scalar add
                // (lowest lane only) would leave without the X+Z contribution
                mul  = Sse.Add(mul, copy);

                // Splat the third lane
                return(Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2)));
            }

            return(DotProduct4D_Software(left, right));
        }
예제 #5
0
        /// <summary>
        /// Computes the squared distance between <paramref name="left"/> and <paramref name="right"/>,
        /// i.e. the dot product of their difference with itself.
        /// </summary>
        /// <param name="left">First point.</param>
        /// <param name="right">Second point.</param>
        /// <returns>
        /// On the intrinsic paths, a vector whose four elements all hold the squared distance.
        /// Without SSE the result comes from <c>DistanceSquared4D_Software</c>.
        /// </returns>
        public static Vector4F DistanceSquared4D(Vector4FParam1_3 left, Vector4FParam1_3 right)
        {
            if (Sse41.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);
                // This multiplies the first 4 elems of each and broadcasts it into each element of the returning vector
                const byte control = 0b_1111_1111;
                return(Sse41.DotProduct(diff, diff, control));
            }
            else if (Sse3.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);
                Vector4F mul  = Sse.Multiply(diff, diff);

                // Double horizontal add leaves the sum of all four squares in every element
                mul = Sse3.HorizontalAdd(mul, mul);
                return(Sse3.HorizontalAdd(mul, mul));
            }
            else if (Sse.IsSupported)
            {
                // NOTE(review): the lane comments below assume Helpers.Shuffle encodes its
                // selectors like _MM_SHUFFLE - confirm against the helper's definition.
                Vector4F diff = Sse.Subtract(left, right);
                Vector4F copy = diff;
                Vector4F mul  = Sse.Multiply(diff, copy);

                // Move mul.X and mul.Y into the upper two lanes of copy
                copy = Sse.Shuffle(copy, mul, Helpers.Shuffle(1, 0, 0, 0));
                // Upper lanes become X+Z and Y+W
                copy = Sse.Add(copy, mul);
                // Bring Y+W into the third lane of mul
                mul  = Sse.Shuffle(mul, copy, Helpers.Shuffle(0, 3, 0, 0));
                // Full-width add: the total is assembled in the third lane, which a scalar add
                // (lowest lane only) would leave without the X+Z contribution
                mul  = Sse.Add(mul, copy);

                // Splat the third lane
                return(Sse.Shuffle(mul, mul, Helpers.Shuffle(2, 2, 2, 2)));
            }

            return(DistanceSquared4D_Software(left, right));
        }
예제 #6
0
        /// <summary>
        /// Scales <paramref name="vector"/> to unit length by dividing every element by the
        /// square root of the vector's dot product with itself.
        /// </summary>
        /// <param name="vector">The vector to normalize.</param>
        /// <returns>
        /// <paramref name="vector"/> divided by its length on the intrinsic paths;
        /// without SSE the result comes from <c>Normalize4D_Software</c>.
        /// </returns>
        public static VectorF Normalize4D(VectorFParam1_3 vector)
        {
            if (Sse41.IsSupported)
            {
                // This multiplies the first 4 elems of each and broadcasts it into each element of the returning vector
                const byte control = 0b_1111_1111;

                // The dot product is the squared length; take the square root before dividing,
                // as the SSE3 and SSE paths do.
                return(Sse.Divide(vector, Sse.Sqrt(Sse41.DotProduct(vector, vector, control))));
            }
            else if (Sse3.IsSupported)
            {
                VectorF mul = Sse.Multiply(vector, vector);

                // Double horizontal add leaves the sum of all four squares in every element
                mul = Sse3.HorizontalAdd(mul, mul);
                return(Sse.Divide(vector, Sse.Sqrt(Sse3.HorizontalAdd(mul, mul))));
            }
            else if (Sse.IsSupported)
            {
                // NOTE(review): the lane comments below assume Shuffle(a, b, c, d) encodes its
                // selectors like _MM_SHUFFLE - confirm against the helper's definition.
                VectorF copy = vector;
                VectorF mul  = Sse.Multiply(vector, copy);

                // Move mul.X and mul.Y into the upper two lanes of copy
                copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));
                // Upper lanes become X+Z and Y+W
                copy = Sse.Add(copy, mul);
                // Bring Y+W into the third lane of mul
                mul  = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));
                // Full-width add: the total is assembled in the third lane, which a scalar add
                // (lowest lane only) would leave without the X+Z contribution
                mul  = Sse.Add(mul, copy);

                // Splat the third lane, then divide by its square root
                return(Sse.Divide(vector, Sse.Sqrt(Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2)))));
            }

            return(Normalize4D_Software(vector));
        }
예제 #7
0
            /// <summary>
            /// Adds this struct's two fields with <c>Sse.Add</c>, writes the sum to the output
            /// buffer of <paramref name="testClass"/>, and has it validate the result.
            /// </summary>
            /// <param name="testClass">Owner of the data table and the validation routine.</param>
            public void RunStructFldScenario(SimpleBinaryOpTest__AddSingle testClass)
            {
                var sum = Sse.Add(_fld1, _fld2);

                Unsafe.Write(testClass._dataTable.outArrayPtr, sum);
                testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
            }
예제 #8
0
        /// <summary>
        /// Element-wise addition of <paramref name="a"/> and <paramref name="b"/> into
        /// <paramref name="s"/>, four floats at a time with SSE; any tail of fewer than four
        /// elements is delegated to <c>AddNaive</c>.
        /// </summary>
        /// <param name="a">First input; its length determines how many elements are processed.</param>
        /// <param name="b">Second input; must be at least as long as <paramref name="a"/>.</param>
        /// <param name="s">Destination; must be at least as long as <paramref name="a"/>.</param>
        /// <exception cref="ArgumentException">
        /// <paramref name="b"/> or <paramref name="s"/> is shorter than <paramref name="a"/>.
        /// </exception>
        private static void AddSse(ReadOnlySpan <float> a, ReadOnlySpan <float> b, Span <float> s)
        {
            // The vector loop goes through raw pointers with no bounds checks, so a short
            // b or s would read or write past the end of its buffer.
            if (b.Length < a.Length || s.Length < a.Length)
            {
                throw new ArgumentException("b and s must each be at least as long as a.");
            }

            var remainder = a.Length & 3;
            var length    = a.Length - remainder;

            fixed(float *ptr = a)
            {
                fixed(float *ptrB = b)
                {
                    fixed(float *ptrS = s)
                    {
                        for (var i = 0; i < length; i += 4)
                        {
                            var j = Sse.LoadVector128(ptr + i);
                            var k = Sse.LoadVector128(ptrB + i);

                            Sse.Store(ptrS + i, Sse.Add(j, k));
                        }
                    }
                }
            }

            // Fewer than four elements left over: handled by the scalar helper
            if (remainder != 0)
            {
                AddNaive(a, b, s, length, a.Length);
            }
        }
예제 #9
0
        /// <summary>
        /// For every index, stores sqrt((G-R)^2 + (B-G)^2 + (R-B)^2) of the three channel bytes
        /// into <paramref name="vv"/>, four elements per step; indices past the last full
        /// group of four are handed to <c>Amari</c>.
        /// </summary>
        /// <param name="red">Red channel; its length drives the loop bounds.</param>
        /// <param name="green">Green channel, indexed in step with <paramref name="red"/>.</param>
        /// <param name="blue">Blue channel, indexed in step with <paramref name="red"/>.</param>
        /// <param name="vv">Receives one result per index.</param>
        private unsafe void Test44_Intrinsics_V128float_Sqrt(byte[] red, byte[] green, byte[] blue, float[] vv)
        {
            int    lanes     = Vector128<float>.Count;
            int    vectorEnd = red.Length - (red.Length % lanes);
            float *scratch   = stackalloc float[lanes];

            fixed (byte *pR = red, pG = green, pB = blue)
            {
                for (int offset = 0; offset < vectorEnd; offset += lanes)
                {
                    // Widen four consecutive bytes of each channel to int32, then convert to float
                    Vector128<float> g = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pG + offset));
                    Vector128<float> r = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pR + offset));
                    Vector128<float> b = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pB + offset));

                    Vector128<float> gMinusR = Sse.Subtract(g, r);
                    Vector128<float> bMinusG = Sse.Subtract(b, g);
                    Vector128<float> rMinusB = Sse.Subtract(r, b);

                    Vector128<float> sumOfSquares = Sse.Add(Sse.Multiply(gMinusR, gMinusR), Sse.Multiply(bMinusG, bMinusG));
                    sumOfSquares = Sse.Add(sumOfSquares, Sse.Multiply(rMinusB, rMinusB));

                    // Spill to the stack buffer, then copy through the bounds-checked array indexer
                    Sse.Store(scratch, Sse.Sqrt(sumOfSquares));
                    for (int lane = 0; lane < lanes; lane++)
                    {
                        vv[offset + lane] = scratch[lane];
                    }
                }
            }

            Amari(vectorEnd, red.Length, red, green, blue, vv);
        }
예제 #10
0
        /// <summary>
        /// Sums all elements of <paramref name="a"/>, loading eight floats per step with AVX and
        /// folding them into a 128-bit accumulator; a tail of fewer than eight elements is added
        /// via <c>HorizontalAddNaive</c>.
        /// </summary>
        /// <param name="a">The values to sum.</param>
        /// <returns>The sum of the elements.</returns>
        private static float HorizontalAddAvx(ReadOnlySpan<float> a)
        {
            var tail             = a.Length & 7;
            var vectorizedLength = a.Length - tail;

            var running = Vector128.Create(0f);

            fixed (float *source = a)
            {
                for (var offset = 0; offset < vectorizedLength; offset += 8)
                {
                    var chunk = Avx.LoadVector256(source + offset);
                    var lower = Avx.ExtractVector128(chunk, 0);
                    var upper = Avx.ExtractVector128(chunk, 1);

                    // The sum over all four lanes of 'running' grows by the sum of this chunk
                    running = Sse3.HorizontalAdd(Sse.Add(lower, upper), running);
                }
            }

            // Collapse the four lanes so that lane 0 holds their total
            var pairwise = Sse3.HorizontalAdd(running, running);
            running = Sse3.HorizontalAdd(pairwise, running);

            var total = 0f;
            Sse.StoreScalar(&total, running);

            if (tail != 0)
            {
                total += HorizontalAddNaive(a, vectorizedLength, a.Length);
            }

            return total;
        }
예제 #11
0
        /// <summary>
        /// Pairwise horizontal add: for A = <paramref name="left"/> and B = <paramref name="right"/>
        /// the result is (Ax + Ay, Az + Aw, Bx + By, Bz + Bw).
        /// </summary>
        /// <param name="left">Supplies the two lower result elements.</param>
        /// <param name="right">Supplies the two upper result elements.</param>
        /// <returns>The vector of pairwise sums.</returns>
        public static Vector128<float> HorizontalAdd(Vector128<float> left, Vector128<float> right)
        {
            if (Sse3.IsSupported)
            {
                return Sse3.HorizontalAdd(left, right);
            }

            if (!Sse.IsSupported)
            {
                return HorizontalAdd_Software(left, right);
            }

            // Without the hadd instruction, gather (Ax, Az, Bx, Bz) and (Ay, Aw, By, Bw) and use
            // an ordinary add. The Sse methods are called explicitly because this would be a
            // slow way to do it on the software fallback.
            Vector128<float> firstOfEachPair  = Sse.Shuffle(left, right, ShuffleValues.XZXZ);
            Vector128<float> secondOfEachPair = Sse.Shuffle(left, right, ShuffleValues.YWYW);

            return Sse.Add(firstOfEachPair, secondOfEachPair);
        }
예제 #12
0
        /// <summary>
        /// Checks <c>Sse.Add</c> on one pair of four-element float inputs and reports the
        /// output values on the console if the element-wise check fails.
        /// </summary>
        /// <param name="args">Unused.</param>
        /// <returns><c>Pass</c> when SSE is unsupported or the check succeeds; otherwise <c>Fail</c>.</returns>
        static unsafe int Main(string[] args)
        {
            if (!Sse.IsSupported)
            {
                return Pass;
            }

            int outcome = Pass;

            var firstInput  = new float[4] { 1, -5, 100, 0 };
            var secondInput = new float[4] { 22, -1, -50, 0 };

            using (TestTable<float> table = new TestTable<float>(firstInput, secondInput, new float[4]))
            {
                var lhs = Unsafe.Read<Vector128<float>>(table.inArray1Ptr);
                var rhs = Unsafe.Read<Vector128<float>>(table.inArray2Ptr);
                var sum = Sse.Add(lhs, rhs);
                Unsafe.Write(table.outArrayPtr, sum);

                bool matches = table.CheckResult((x, y, z) => x + y == z);
                if (!matches)
                {
                    Console.WriteLine("SSE Add failed on float:");
                    foreach (var value in table.outArray)
                    {
                        Console.Write(value + ", ");
                    }
                    Console.WriteLine();
                    outcome = Fail;
                }
            }

            return outcome;
        }
예제 #13
0
        /// <summary>
        /// Returns the sum of the squares of every byte in <paramref name="vs"/>. The array is
        /// processed in parallel ranges that are small enough for the per-range float
        /// accumulator to stay exact.
        /// </summary>
        /// <param name="vs">Input bytes.</param>
        /// <returns>The sum of squares; 0 for an empty array.</returns>
        private unsafe long Test28_Intrinsics_SSE41_DotProduct_float_MT_Kai(byte[] vs)
        {
            // Partitioner.Create requires toExclusive > fromInclusive, so an empty array
            // is answered directly instead of throwing.
            if (vs.Length == 0)
            {
                return(0);
            }

            long total      = 0;
            int  simdLength = Vector128 <int> .Count * 4;

            // Number of elements per range (the partition size) that the accumulating
            // Vector128<float> vTotal can handle = 1032:
            // 24-bit float mantissa / (byte max * byte max) * lane count
            // 16777215 / (255 * 255) * 4 = 1032.0471, with the fraction truncated.
            int rangeSize =
                ((1 << 24) - 1) / (byte.MaxValue * byte.MaxValue) * Vector128 <float> .Count;//1032

            Parallel.ForEach(
                Partitioner.Create(0, vs.Length, rangeSize),
                (range) =>
            {
                var vTotal    = Vector128 <float> .Zero;
                // End of the part of this range that divides evenly into 16-byte steps
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                fixed(byte *p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        Vector128 <int> v = Sse41.ConvertToVector128Int32(p + i);
                        var vv            = Sse2.ConvertToVector128Single(v);
                        // Multiply all four elements (upper four mask bits set) and place the
                        // sum in element 0 (lowest mask bit set)
                        Vector128 <float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                        vTotal = Sse.Add(vTotal, dp);

                        v      = Sse41.ConvertToVector128Int32(p + i + 4);
                        vv     = Sse2.ConvertToVector128Single(v);
                        dp     = Sse41.DotProduct(vv, vv, 0b11110010);// place the sum in element 1
                        vTotal = Sse.Add(vTotal, dp);

                        v      = Sse41.ConvertToVector128Int32(p + i + 8);
                        vv     = Sse2.ConvertToVector128Single(v);
                        dp     = Sse41.DotProduct(vv, vv, 0b11110100);// place the sum in element 2
                        vTotal = Sse.Add(vTotal, dp);

                        v      = Sse41.ConvertToVector128Int32(p + i + 12);
                        vv     = Sse2.ConvertToVector128Single(v);
                        dp     = Sse41.DotProduct(vv, vv, 0b11111000);// place the sum in element 3
                        vTotal = Sse.Add(vTotal, dp);
                    }
                }
                // Fold the four float lanes into an integer subtotal
                long subtotal = 0;
                float *f      = stackalloc float[Vector128 <float> .Count];
                Sse.Store(f, vTotal);
                for (int i = 0; i < Vector128 <float> .Count; i++)
                {
                    subtotal += (long)f[i];
                }
                // Scalar pass over the bytes of this range that did not fill a 16-byte step
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return(total);
        }
예제 #14
0
        /// <summary>
        /// Adds the two fields of a freshly constructed test instance with <c>Sse.Add</c>,
        /// writes the sum to this object's output buffer, and validates it.
        /// </summary>
        public void RunLclFldScenario()
        {
            var instance = new SimpleBinaryOpTest__AddSingle();
            var sum      = Sse.Add(instance._fld1, instance._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
        }
예제 #15
0
        /// <summary>
        /// Adds the two fields of a local <c>TestStruct</c> with <c>Sse.Add</c>, writes the sum
        /// to the output buffer, and validates it.
        /// </summary>
        public void RunStructLclFldScenario()
        {
            var holder = TestStruct.Create();
            var sum    = Sse.Add(holder._fld1, holder._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(holder._fld1, holder._fld2, _dataTable.outArrayPtr);
        }
예제 #16
0
        /// <summary>
        /// Announces the scenario, adds the two instance fields with <c>Sse.Add</c>, writes the
        /// sum to the output buffer, and validates it.
        /// </summary>
        public void RunClassFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

            var sum = Sse.Add(_fld1, _fld2);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
        }
예제 #17
0
        /// <summary>
        /// Adds <paramref name="left"/> and <paramref name="right"/> element-wise.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>The sum, from <c>Sse.Add</c> when SSE is supported and from the software fallback otherwise.</returns>
        public static Vector4F Add(Vector4FParam1_3 left, Vector4FParam1_3 right)
        {
            if (!Sse.IsSupported)
            {
                return SoftwareFallbacks.SoftwareFallbacksVector4F.Add_Software(left, right);
            }

            return Sse.Add(left, right);
        }
예제 #18
0
        /// <summary>
        /// Reads both inputs into locals with <c>Unsafe.Read</c>, adds them with <c>Sse.Add</c>,
        /// writes the sum to the output buffer, and validates it against the locals.
        /// </summary>
        public void RunLclVarScenario_UnsafeRead()
        {
            var lhs = Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr);
            var rhs = Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr);
            var sum = Sse.Add(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
        }
예제 #19
0
        /// <summary>
        /// Adds <paramref name="left"/> and <paramref name="right"/> element-wise.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>The sum, from <c>Sse.Add</c> when SSE is supported and from <c>Add_Software</c> otherwise.</returns>
        public static VectorF Add(VectorFParam1_3 left, VectorFParam1_3 right)
        {
            if (!Sse.IsSupported)
            {
                return Add_Software(left, right);
            }

            return Sse.Add(left, right);
        }
예제 #20
0
        /// <summary>
        /// Loads both inputs into locals with the aligned-load intrinsic, adds them with
        /// <c>Sse.Add</c>, writes the sum to the output buffer, and validates it against the locals.
        /// </summary>
        public void RunLclVarScenario_LoadAligned()
        {
            var lhs = Sse.LoadAlignedVector128((Single *)(_dataTable.inArray1Ptr));
            var rhs = Sse.LoadAlignedVector128((Single *)(_dataTable.inArray2Ptr));
            var sum = Sse.Add(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
        }
예제 #21
0
        /// <summary>
        /// Adds <paramref name="left"/> and <paramref name="right"/> element-wise.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>The sum, from <c>Sse.Add</c> when SSE is supported and from <c>Add_Software</c> otherwise.</returns>
        public static Vector128<float> Add(Vector128<float> left, Vector128<float> right)
        {
            if (!Sse.IsSupported)
            {
                return Add_Software(left, right);
            }

            return Sse.Add(left, right);
        }
예제 #22
0
        /// <summary>
        /// Adds <c>_clsVar1</c> and <c>_clsVar2</c> with <c>Sse.Add</c>, writes the sum to the
        /// output buffer, and validates it.
        /// </summary>
        public void RunClsVarScenario()
        {
            var sum = Sse.Add(_clsVar1, _clsVar2);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
        }
예제 #23
0
        /// <summary>
        /// Loads both inputs with the aligned-load intrinsic, adds them with <c>Sse.Add</c>,
        /// writes the sum to the output buffer, and validates it against the input buffers.
        /// </summary>
        public void RunBasicScenario_LoadAligned()
        {
            var lhs = Sse.LoadAlignedVector128((Single *)(_dataTable.inArray1Ptr));
            var rhs = Sse.LoadAlignedVector128((Single *)(_dataTable.inArray2Ptr));
            var sum = Sse.Add(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #24
0
        /// <summary>
        /// Reads both inputs with <c>Unsafe.Read</c>, adds them with <c>Sse.Add</c>, writes the
        /// sum to the output buffer, and validates it against the input buffers.
        /// </summary>
        public void RunBasicScenario_UnsafeRead()
        {
            var lhs = Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr);
            var rhs = Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr);
            var sum = Sse.Add(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
        }
예제 #25
0
        /// <summary>
        /// Announces the scenario, then adds the two fields of a freshly constructed test
        /// instance with <c>Sse.Add</c>, writes the sum to this object's output buffer, and
        /// validates it.
        /// </summary>
        public void RunClassLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

            var instance = new SimpleBinaryOpTest__AddSingle();
            var sum = Sse.Add(instance._fld1, instance._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
        }
예제 #26
0
        /// <summary>
        /// Announces the scenario, then adds the two fields of a local <c>TestStruct</c> with
        /// <c>Sse.Add</c>, writes the sum to the output buffer, and validates it.
        /// </summary>
        public void RunStructLclFldScenario()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

            var holder = TestStruct.Create();
            var sum = Sse.Add(holder._fld1, holder._fld2);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(holder._fld1, holder._fld2, _dataTable.outArrayPtr);
        }
예제 #27
0
        /// <summary>
        /// Adds <paramref name="scalar"/> to every element of <paramref name="vector"/>.
        /// </summary>
        /// <param name="vector">The vector operand.</param>
        /// <param name="scalar">The value added to each element.</param>
        /// <returns>The sum, computed with SSE when supported and by <c>Add_Software</c> otherwise.</returns>
        public static VectorF Add(VectorFParam1_3 vector, float scalar)
        {
            if (!Sse.IsSupported)
            {
                return Add_Software(vector, scalar);
            }

            // Replicate the scalar into all lanes so a plain vector add can be used
            VectorF broadcast = Vector128.Create(scalar);
            return Sse.Add(vector, broadcast);
        }
예제 #28
0
        /// <summary>
        /// Subtracts <paramref name="scalar"/> from every element of <paramref name="vector"/>.
        /// </summary>
        /// <param name="vector">The vector operand (minuend).</param>
        /// <param name="scalar">The value subtracted from each element.</param>
        /// <returns>The difference, computed with SSE when supported and by the software fallback otherwise.</returns>
        public static Vector4F Subtract(Vector4FParam1_3 vector, float scalar)
        {
            if (Sse.IsSupported)
            {
                // Replicate the scalar into all lanes, then subtract (the hardware path must
                // subtract, matching the Subtract_Software fallback, not add)
                Vector4F expand = Vector128.Create(scalar);
                return(Sse.Subtract(vector, expand));
            }

            return(SoftwareFallbacks.SoftwareFallbacksVector4F.Subtract_Software(vector, scalar));
        }
예제 #29
0
        /// <summary>
        /// Announces the scenario, loads both inputs into locals with the aligned-load intrinsic,
        /// adds them with <c>Sse.Add</c>, writes the sum to the output buffer, and validates it
        /// against the locals.
        /// </summary>
        public void RunLclVarScenario_LoadAligned()
        {
            TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

            var lhs = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr));
            var rhs = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr));
            var sum = Sse.Add(lhs, rhs);

            Unsafe.Write(_dataTable.outArrayPtr, sum);
            ValidateResult(lhs, rhs, _dataTable.outArrayPtr);
        }
예제 #30
0
        // Computes (a * b + c) + d on 128-bit float vectors and returns the first element,
        // where b, c, d are the constants 1, 2, 3 broadcast to every element and a is loaded
        // from the storage of a local of type 'vec'.
        static unsafe float fmaTest()
        {
            // 'a' is never assigned in this method; only its address is taken below.
            // NOTE(review): 'vec' is declared elsewhere - presumably at least 16 bytes so the
            // 128-bit load stays inside it; confirm.
            vec a;
            var b = Vector128.Create(1f);
            var c = Vector128.Create(2f);
            var d = Vector128.Create(3f);

            // Fused multiply-add: c = (four floats loaded from &a) * b + c
            c = Fma.MultiplyAdd(Sse.LoadVector128((float *)&a), b, c);

            // Add d and return element 0 of the result
            return(Sse.Add(c, d).ToScalar());
        }