Пример #1
0
        /// <summary>
        /// Multithreaded version of the SSE4.1 dot-product benchmark: returns the sum of
        /// <c>b * b</c> over every byte in <paramref name="vs"/>.
        /// </summary>
        /// <param name="vs">Input bytes; may be empty.</param>
        /// <returns>The sum of the squares of all elements.</returns>
        private unsafe long Test17_Intrinsics_SSE41_DotProduct_float_MT(byte[] vs)
        {
            // Partitioner.Create rejects an empty range, so answer the empty input directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total      = 0;
            int  simdLength = Vector128<int>.Count;

            // One range per logical processor. Clamp to 1: the division yields 0 when the array is
            // shorter than the processor count, and Partitioner.Create rejects a range size of 0.
            int rangeSize = Math.Max(1, vs.Length / Environment.ProcessorCount);

            Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize),
                             (range) =>
            {
                long subtotal = 0;

                // End of the part of this range that can be consumed in whole SIMD steps.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        // Widen 4 bytes to 4 ints, then convert them to floats.
                        Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                        var vv = Sse2.ConvertToVector128Single(v);

                        // Multiply all four lanes (bits 4-7 set) and put the sum in lane 0 (bit 0 set).
                        Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                        subtotal += (long)dp.GetElement(0);
                    }
                }

                // Scalar tail of this range.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }

                // Ranges run concurrently, so merge the partial sum atomically.
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return total;
        }
Пример #2
0
        /// <summary>
        /// Computes the dot product of all four lanes of <paramref name="left"/> and
        /// <paramref name="right"/>, choosing the best instruction set available at runtime.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>The result vector; on the SSE4.1 and SSE3 paths every lane holds the dot product.</returns>
        public static Vector128<float> DotProduct4D(Vector128<float> left, Vector128<float> right)
        {
            if (Sse41.IsSupported)
            {
                // dpps: the high nibble selects the lanes to multiply, the low nibble the lanes that
                // receive the sum. 0xFF = multiply everything, write everywhere.
                return Sse41.DotProduct(left, right, 0b_1111_1111);
            }

            if (Sse3.IsSupported)
            {
                Vector128<float> products = Sse.Multiply(left, right);

                // Two horizontal adds leave the total of all four products in every lane.
                Vector128<float> pairSums = Sse3.HorizontalAdd(products, products);
                return Sse3.HorizontalAdd(pairSums, pairSums);
            }

            if (Sse.IsSupported)
            {
                Vector128<float> products = Sse.Multiply(left, right);

                // Shuffle/add reduction; the lane selections come from the ShuffleValues constants,
                // which are declared outside this block.
                Vector128<float> scratch = Sse.Shuffle(right, products, ShuffleValues.XXXY);
                scratch  = Sse.Add(scratch, products);
                products = Sse.Shuffle(products, scratch, ShuffleValues.XXWX);
                products = Sse.Add(products, scratch);

                return Sse.Shuffle(products, products, ShuffleValues.ZZZZ);
            }

            return DotProduct4D_Software(left, right);
        }
Пример #3
0
        /// <summary>
        /// Computes the dot product of all four lanes of <paramref name="left"/> and
        /// <paramref name="right"/>, choosing the best instruction set available at runtime.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>A vector whose lanes hold the dot product.</returns>
        public static VectorF DotProduct4D(VectorFParam1_3 left, VectorFParam1_3 right)
        {
            if (Sse41.IsSupported)
            {
                // dpps: the high nibble selects the lanes to multiply, the low nibble the lanes that
                // receive the sum. 0xFF = multiply everything, write everywhere.
                const byte control = 0b_1111_1111;
                return Sse41.DotProduct(left, right, control);
            }
            else if (Sse3.IsSupported)
            {
                VectorF mul = Sse.Multiply(left, right);

                // Two horizontal adds leave the total of all four products in every lane.
                mul = Sse3.HorizontalAdd(mul, mul);
                return Sse3.HorizontalAdd(mul, mul);
            }
            else if (Sse.IsSupported)
            {
                VectorF copy = right;
                VectorF mul  = Sse.Multiply(left, copy);
                copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));
                copy = Sse.Add(copy, mul);
                mul  = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));

                // Must be a full-width add: the broadcast below reads lane 2, and AddScalar only
                // updates lane 0, which would leave lane 2 holding a partial sum.
                // NOTE(review): assumes Shuffle(2, 2, 2, 2) selects lane 2 for every output lane.
                mul = Sse.Add(mul, copy);

                return Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2));
            }

            return DotProduct4D_Software(left, right);
        }
Пример #4
0
        /// <summary>
        /// Computes the squared distance between <paramref name="left"/> and <paramref name="right"/>
        /// over all four lanes, i.e. the dot product of their difference with itself.
        /// </summary>
        /// <param name="left">First point.</param>
        /// <param name="right">Second point.</param>
        /// <returns>A vector whose lanes hold the squared distance.</returns>
        public static Vector4F DistanceSquared4D(Vector4FParam1_3 left, Vector4FParam1_3 right)
        {
            if (Sse41.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);

                // dpps: the high nibble selects the lanes to multiply, the low nibble the lanes that
                // receive the sum. 0xFF = multiply everything, write everywhere.
                const byte control = 0b_1111_1111;
                return Sse41.DotProduct(diff, diff, control);
            }
            else if (Sse3.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);
                Vector4F mul  = Sse.Multiply(diff, diff);

                // Two horizontal adds leave the total of all four products in every lane.
                mul = Sse3.HorizontalAdd(mul, mul);
                return Sse3.HorizontalAdd(mul, mul);
            }
            else if (Sse.IsSupported)
            {
                Vector4F diff = Sse.Subtract(left, right);
                Vector4F copy = diff;
                Vector4F mul  = Sse.Multiply(diff, copy);
                copy = Sse.Shuffle(copy, mul, Helpers.Shuffle(1, 0, 0, 0));
                copy = Sse.Add(copy, mul);
                mul  = Sse.Shuffle(mul, copy, Helpers.Shuffle(0, 3, 0, 0));

                // Must be a full-width add: the broadcast below reads lane 2, and AddScalar only
                // updates lane 0, which would leave lane 2 holding a partial sum.
                // NOTE(review): assumes Helpers.Shuffle(2, 2, 2, 2) selects lane 2 for every output lane.
                mul = Sse.Add(mul, copy);

                return Sse.Shuffle(mul, mul, Helpers.Shuffle(2, 2, 2, 2));
            }

            return DistanceSquared4D_Software(left, right);
        }
Пример #5
0
        /// <summary>
        /// Computes the dot product of the two lowest double lanes of <paramref name="left"/> and
        /// <paramref name="right"/> and widens the 128-bit result to 256 bits.
        /// </summary>
        /// <param name="left">First operand; only its lower 128 bits are read on the hardware paths.</param>
        /// <param name="right">Second operand; only its lower 128 bits are read on the hardware paths.</param>
        /// <returns>A 256-bit vector built from the 128-bit dot-product result.</returns>
        public static Vector256<double> DotProduct2D(Vector256<double> left, Vector256<double> right)
        {
            if (Sse41.IsSupported)
            {
                // dppd: bits 4-5 select the lanes to multiply, bits 0-1 the lanes that receive the sum.
                Vector2D dot = Sse41.DotProduct(left.GetLower(), right.GetLower(), 0b_0011_1111);
                return Helpers.DuplicateToVector256(dot);
            }

            if (Sse3.IsSupported)
            {
                Vector2D products = Sse2.Multiply(left.GetLower(), right.GetLower());

                // A horizontal add of the vector with itself puts x*x' + y*y' in both lanes.
                return Helpers.DuplicateToVector256(Sse3.HorizontalAdd(products, products));
            }

            if (Sse2.IsSupported)
            {
                Vector2D products = Sse2.Multiply(left.GetLower(), right.GetLower());

                // NOTE(review): the sum below is only the dot product in both lanes if
                // ShuffleValues.YXYX swaps the two double lanes — confirm its encoding.
                Vector2D rearranged = Sse2.Shuffle(products, products, ShuffleValues.YXYX);
                var sum = Sse2.Add(products, rearranged);

                return sum.ToVector256Unsafe().WithUpper(sum);
            }

            return DotProduct2D_Software(left, right);
        }
Пример #6
0
        /// <summary>
        /// Normalizes <paramref name="vector"/> over all four lanes by dividing it by its length,
        /// choosing the best instruction set available at runtime.
        /// </summary>
        /// <param name="vector">Vector to normalize. No zero-length check is performed.</param>
        /// <returns><paramref name="vector"/> divided by its 4D length.</returns>
        public static VectorF Normalize4D(VectorFParam1_3 vector)
        {
            if (Sse41.IsSupported)
            {
                // dpps: the high nibble selects the lanes to multiply, the low nibble the lanes that
                // receive the sum. 0xFF = multiply everything, write everywhere.
                const byte control = 0b_1111_1111;
                VectorF lengthSquared = Sse41.DotProduct(vector, vector, control);

                // The dot product is the squared length; take the root before dividing,
                // exactly as the other paths do.
                return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
            }
            else if (Sse3.IsSupported)
            {
                VectorF mul = Sse.Multiply(vector, vector);

                // Two horizontal adds leave the total of all four products in every lane.
                mul = Sse3.HorizontalAdd(mul, mul);
                return Sse.Divide(vector, Sse.Sqrt(Sse3.HorizontalAdd(mul, mul)));
            }
            else if (Sse.IsSupported)
            {
                VectorF copy = vector;
                VectorF mul  = Sse.Multiply(vector, copy);
                copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));
                copy = Sse.Add(copy, mul);
                mul  = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));

                // Must be a full-width add: the broadcast below reads lane 2, and AddScalar only
                // updates lane 0, which would leave lane 2 holding a partial sum.
                // NOTE(review): assumes Shuffle(2, 2, 2, 2) selects lane 2 for every output lane.
                mul = Sse.Add(mul, copy);

                return Sse.Divide(vector, Sse.Sqrt(Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2))));
            }

            return Normalize4D_Software(vector);
        }
Пример #7
0
        /// <summary>
        /// Multithreaded sum of squares of <paramref name="vs"/> that keeps a float accumulator per
        /// range and splits the array into ranges small enough that the accumulator stays exact.
        /// </summary>
        /// <param name="vs">Input bytes; may be empty.</param>
        /// <returns>The sum of the squares of all elements.</returns>
        private unsafe long Test28_Intrinsics_SSE41_DotProduct_float_MT_Kai(byte[] vs)
        {
            // Partitioner.Create rejects an empty range, so answer the empty input directly.
            if (vs.Length == 0)
            {
                return 0;
            }

            long total      = 0;
            int  simdLength = Vector128<int>.Count * 4;

            // Range size chosen so the Vector128<float> accumulator never leaves the range in which
            // a float represents integers exactly (24-bit significand):
            //   16777215 / (255 * 255) * 4 = 1032 (integer division).
            int rangeSize =
                ((1 << 24) - 1) / (byte.MaxValue * byte.MaxValue) * Vector128<float>.Count; // 1032

            Parallel.ForEach(
                Partitioner.Create(0, vs.Length, rangeSize),
                (range) =>
            {
                var vTotal = Vector128<float>.Zero;

                // End of the part of this range that can be consumed in whole 16-byte steps.
                int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
                fixed (byte* p = vs)
                {
                    for (int i = range.Item1; i < lastIndex; i += simdLength)
                    {
                        // Multiply all four lanes (bits 4-7 set) and put the sum in lane 0 (bit 0 set).
                        Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                        var vv = Sse2.ConvertToVector128Single(v);
                        Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                        vTotal = Sse.Add(vTotal, dp);

                        v      = Sse41.ConvertToVector128Int32(p + i + 4);
                        vv     = Sse2.ConvertToVector128Single(v);
                        dp     = Sse41.DotProduct(vv, vv, 0b11110010); // sum goes to lane 1
                        vTotal = Sse.Add(vTotal, dp);

                        v      = Sse41.ConvertToVector128Int32(p + i + 8);
                        vv     = Sse2.ConvertToVector128Single(v);
                        dp     = Sse41.DotProduct(vv, vv, 0b11110100); // sum goes to lane 2
                        vTotal = Sse.Add(vTotal, dp);

                        v      = Sse41.ConvertToVector128Int32(p + i + 12);
                        vv     = Sse2.ConvertToVector128Single(v);
                        dp     = Sse41.DotProduct(vv, vv, 0b11111000); // sum goes to lane 3
                        vTotal = Sse.Add(vTotal, dp);
                    }
                }

                // Fold the four float lanes into an integer subtotal.
                long subtotal = 0;
                float* f = stackalloc float[Vector128<float>.Count];
                Sse.Store(f, vTotal);
                for (int i = 0; i < Vector128<float>.Count; i++)
                {
                    subtotal += (long)f[i];
                }

                // Scalar tail of this range.
                for (int i = lastIndex; i < range.Item2; i++)
                {
                    subtotal += vs[i] * vs[i];
                }

                // Ranges run concurrently, so merge the partial sum atomically.
                System.Threading.Interlocked.Add(ref total, subtotal);
            });
            return total;
        }
Пример #8
0
 /// <summary>
 /// Multiplies a matrix by a vector: each result component is the 4-lane dot product of the
 /// four floats starting at one of <c>M11</c>, <c>M21</c>, <c>M31</c>, <c>M41</c> with the four
 /// floats starting at <c>b.X</c>.
 /// </summary>
 public static unsafe Float4 operator *(Float4x4 a, Float4 b)
 {
     // b is a by-value copy, so loading it once is equivalent to reloading it per component.
     var operand = Sse.LoadVector128(&b.X);

     // 0xFF: multiply all four lanes and write the sum to every lane; ToScalar reads lane 0.
     var x = Sse41.DotProduct(Sse.LoadVector128(&a.M11), operand, 0xFF).ToScalar();
     var y = Sse41.DotProduct(Sse.LoadVector128(&a.M21), operand, 0xFF).ToScalar();
     var z = Sse41.DotProduct(Sse.LoadVector128(&a.M31), operand, 0xFF).ToScalar();
     var w = Sse41.DotProduct(Sse.LoadVector128(&a.M41), operand, 0xFF).ToScalar();

     return new Float4 { X = x, Y = y, Z = z, W = w };
 }
Пример #9
0
        /// <summary>
        /// Returns the square root of the 4-lane dot product of <paramref name="vec"/> with itself,
        /// reading the struct directly as a <c>Vector128&lt;float&gt;</c>.
        /// </summary>
        static unsafe float Length_Sse_V6_Helper(MyVector4 vec)
        {
            // Reinterpret the by-value struct in place instead of issuing an explicit load.
            Vector128<float> lanes = *(Vector128<float>*)&vec;

            // 0xF1: multiply all four lanes, write the sum to lane 0 only.
            float lengthSquared = Sse41.DotProduct(lanes, lanes, 0xF1).GetElement(0);

            return MathF.Sqrt(lengthSquared);
        }
Пример #10
0
        /// <summary>
        /// Returns the square root of the 4-lane dot product of <paramref name="vec"/> with itself,
        /// loading the struct through a <c>float*</c>.
        /// </summary>
        static unsafe float Length_Sse_V2_Helper(MyVector4 vec)
        {
            var lanes = Sse.LoadVector128((float*)&vec);

            // 0xF1: multiply all four lanes, write the sum to lane 0 only.
            var dot = Sse41.DotProduct(lanes, lanes, 0xF1);
            float lengthSquared = dot.GetElement(0);

            return MathF.Sqrt(lengthSquared);
        }
Пример #11
0
        /// <summary>
        /// Scales all four lanes of <paramref name="a"/> by the approximate reciprocal square root
        /// of the dot product of its first three lanes, and returns the scaled copy.
        /// </summary>
        public static unsafe Float4 Normalize(Float4 a)
        {
            Vector128<float> lanes = Sse.LoadVector128(&a.X);

            // 0x7F: multiply lanes 0-2 only, write the sum to every lane.
            Vector128<float> lengthSquared = Sse41.DotProduct(lanes, lanes, 0x7F);
            Vector128<float> inverseLength = Sse.ReciprocalSqrt(lengthSquared);

            // a is a by-value copy, so writing through its address only changes the return value.
            Sse.Store(&a.X, Sse.Multiply(lanes, inverseLength));

            return a;
        }
        /// <summary>
        /// Computes the 2D dot product <c>x1 * x2 + y1 * y2</c>, using the SSE4.1 <c>dppd</c>
        /// instruction when <c>HAS_INTRINSICS</c> is defined and the CPU supports it.
        /// </summary>
        /// <param name="x1">X component of the first vector.</param>
        /// <param name="y1">Y component of the first vector.</param>
        /// <param name="x2">X component of the second vector.</param>
        /// <param name="y2">Y component of the second vector.</param>
        /// <returns>The dot product of (x1, y1) and (x2, y2).</returns>
        public static unsafe double DotProduct2DSSE(
            double x1, double y1,
            double x2, double y2)
        {
#if HAS_INTRINSICS
            if (Sse41.IsSupported)
            {
                // Build the operands from the values themselves. Loading 16 bytes from &x1 / &x2
                // would read past an 8-byte parameter and rely on y1 / y2 happening to sit next to
                // it, and storing the 16-byte result back through &x1 would overwrite its neighbour.
                var first  = System.Runtime.Intrinsics.Vector128.Create(x1, y1);
                var second = System.Runtime.Intrinsics.Vector128.Create(x2, y2);

                // 51 = 0b0011_0011: multiply both lanes, write the sum to both lanes.
                return System.Runtime.Intrinsics.Vector128.ToScalar(Sse41.DotProduct(first, second, 51));
            }
#endif
            return (x1 * x2) + (y1 * y2);
        }
Пример #13
0
        /// <summary>
        /// Sum of squares of <paramref name="vs"/> using SSE4.1 DotProduct, with four dot products
        /// per loop iteration, each written to a different lane of a float accumulator.
        /// </summary>
        /// <param name="vs">Input bytes; may be empty.</param>
        /// <returns>The sum of the squares of all elements.</returns>
        private unsafe long Test8_Intrinsics_SSE41_DotProduct_float(byte[] vs)
        {
            // Each iteration adds at most 4 * 255 * 255 = 260,100 to a lane. After 64 iterations a
            // lane holds at most 16,646,400, still below 2^24, so every float add is exact. Without
            // draining, long inputs push the lanes past 2^24 and the additions start to round.
            const int iterationsPerFlush = 64;

            long total      = 0;
            int  simdLength = Vector128<int>.Count * 4;
            int  lastIndex  = vs.Length - (vs.Length % simdLength);
            var  vTotal     = Vector128<float>.Zero;
            int  pending    = 0;

            fixed (byte* p = vs)
            {
                for (int i = 0; i < lastIndex; i += simdLength)
                {
                    // Multiply all four lanes (bits 4-7 set) and put the sum in lane 0 (bit 0 set).
                    Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                    var vv = Sse2.ConvertToVector128Single(v);
                    Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                    vTotal = Sse.Add(vTotal, dp);

                    v      = Sse41.ConvertToVector128Int32(p + i + 4);
                    vv     = Sse2.ConvertToVector128Single(v);
                    dp     = Sse41.DotProduct(vv, vv, 0b11110010); // sum goes to lane 1
                    vTotal = Sse.Add(vTotal, dp);

                    v      = Sse41.ConvertToVector128Int32(p + i + 8);
                    vv     = Sse2.ConvertToVector128Single(v);
                    dp     = Sse41.DotProduct(vv, vv, 0b11110100); // sum goes to lane 2
                    vTotal = Sse.Add(vTotal, dp);

                    v      = Sse41.ConvertToVector128Int32(p + i + 12);
                    vv     = Sse2.ConvertToVector128Single(v);
                    dp     = Sse41.DotProduct(vv, vv, 0b11111000); // sum goes to lane 3
                    vTotal = Sse.Add(vTotal, dp);

                    // Drain the float lanes into the integer total before they can lose precision.
                    if (++pending == iterationsPerFlush)
                    {
                        total  += SumLanes(vTotal);
                        vTotal  = Vector128<float>.Zero;
                        pending = 0;
                    }
                }
            }

            total += SumLanes(vTotal);

            // Scalar tail for the elements that do not fill a whole 16-byte step.
            for (int i = lastIndex; i < vs.Length; i++)
            {
                total += vs[i] * vs[i];
            }
            return total;

            // Adds the four lanes of an accumulator as integers.
            long SumLanes(Vector128<float> lanes)
            {
                long sum = 0;
                for (int lane = 0; lane < Vector128<float>.Count; lane++)
                {
                    sum += (long)lanes.GetElement(lane);
                }
                return sum;
            }
        }
Пример #14
0
        /// <summary>
        /// Computes the squared distance between <paramref name="left"/> and <paramref name="right"/>
        /// using only their first two lanes, choosing the best instruction set available at runtime.
        /// </summary>
        /// <param name="left">First point.</param>
        /// <param name="right">Second point.</param>
        /// <returns>A vector with the 2D squared distance in every lane (hardware paths).</returns>
        public static Vector4F DistanceSquared2D(Vector4FParam1_3 left, Vector4FParam1_3 right)
        {
            if (Sse41.IsSupported)
            {
                Vector4F delta = Sse.Subtract(left, right);

                // dpps with 0x3F: multiply lanes 0 and 1 only, write the sum to every lane.
                return Sse41.DotProduct(delta, delta, 0b_0011_1111);
            }

            // Without dpps the multiply is still vectorized; only the way the products are summed
            // differs between SSE3 and plain SSE.
            if (Sse3.IsSupported)
            {
                Vector4F delta   = Sse.Subtract(left, right);
                Vector4F squares = Sse.Multiply(delta, delta);

                // Mask before summing; by its name the mask clears lanes 2 and 3 (Z and W).
                Vector4F masked = Sse.And(squares, MaskWAndZToZero);

                // Horizontal add of a vector with itself gives (e0+e1, e2+e3, e0+e1, e2+e3).
                Vector4F pairSums = Sse3.HorizontalAdd(masked, masked);

                // movsldup copies lane 0 into lane 1 and lane 2 into lane 3.
                return Sse3.MoveLowAndDuplicate(pairSums);
            }

            if (Sse.IsSupported)
            {
                Vector4F delta   = Sse.Subtract(left, right);
                Vector4F squares = Sse.Multiply(delta, delta);

                // Bring the second square next to the first, add them in lane 0, then spread lane 0.
                Vector4F second = Sse.Shuffle(squares, squares, Helpers.Shuffle(1, 1, 1, 1));
                Vector4F summed = Sse.AddScalar(squares, second);

                return Sse.Shuffle(summed, summed, Helpers.Shuffle(0, 0, 0, 0));
            }

            return DistanceSquared2D_Software(left, right);
        }
Пример #15
0
        /// <summary>
        /// Loads 16 bytes from <c>ptr</c>, subtracts <c>subtmp</c>, splits the result into two groups
        /// through <c>mask0</c> / <c>mask1</c>, takes the dot product of each group with
        /// <c>mul0</c> / <c>mul1</c>, and returns the sum of the two products as an integer.
        /// </summary>
        /// <returns>Lane 3 of the summed, integer-converted dot products.</returns>
        public unsafe int ParseSIMD()
        {
            var raw     = Sse2.LoadVector128(ptr);
            var shifted = Sse2.Subtract(Sse.StaticCast<byte, sbyte>(raw), subtmp);

            // Rearrange the bytes for each group, then treat them as ints and convert to floats.
            var firstGroup  = Sse2.ConvertToVector128Single(
                Sse.StaticCast<sbyte, int>(Ssse3.Shuffle(shifted, mask0)));
            var secondGroup = Sse2.ConvertToVector128Single(
                Sse.StaticCast<sbyte, int>(Ssse3.Shuffle(shifted, mask1)));

            // 0b11111000: multiply all four lanes, write the sum to lane 3 only.
            var firstDot  = Sse41.DotProduct(firstGroup, mul0, 0b11111000);
            var secondDot = Sse41.DotProduct(secondGroup, mul1, 0b11111000);

            var combined = Sse2.Add(Sse2.ConvertToVector128Int32(firstDot),
                                    Sse2.ConvertToVector128Int32(secondDot));

            return Sse41.Extract(combined, 3);
        }
Пример #16
0
        /// <summary>
        /// Divides <paramref name="vector"/> by the length of its first two lanes, choosing the best
        /// instruction set available at runtime.
        /// </summary>
        /// <param name="vector">Vector to normalize. No zero-length check is performed.</param>
        /// <returns><paramref name="vector"/> divided lane-wise by its 2D length.</returns>
        public static VectorF Normalize2D(VectorFParam1_3 vector)
        {
            #region Manual Inline
            if (Sse41.IsSupported)
            {
                // dpps with 0x3F: multiply lanes 0 and 1 only, write the sum to every lane.
                VectorF lengthSquared = Sse41.DotProduct(vector, vector, 0b_0011_1111);

                return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
            }

            // Without dpps the multiply is still vectorized; only the way the products are summed
            // differs between SSE3 and plain SSE.
            if (Sse3.IsSupported)
            {
                VectorF squares = Sse.Multiply(vector, vector);

                // Mask before summing; by its name the mask clears lanes 2 and 3 (Z and W).
                VectorF masked = Sse.And(squares, MaskWAndZToZero);

                // Horizontal add of a vector with itself gives (e0+e1, e2+e3, e0+e1, e2+e3).
                VectorF pairSums = Sse3.HorizontalAdd(masked, masked);

                // movsldup copies lane 0 into lane 1 and lane 2 into lane 3.
                VectorF lengthSquared = Sse3.MoveLowAndDuplicate(pairSums);
                return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
            }

            if (Sse.IsSupported)
            {
                VectorF squares = Sse.Multiply(vector, vector);

                // Bring the second square next to the first, add them in lane 0, then spread lane 0.
                VectorF second        = Sse.Shuffle(squares, squares, Shuffle(1, 1, 1, 1));
                VectorF summed        = Sse.AddScalar(squares, second);
                VectorF lengthSquared = Sse.Shuffle(summed, summed, Shuffle(0, 0, 0, 0));

                return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
            }
            #endregion

            return Normalize2D_Software(vector);
        }
Пример #17
0
        /// <summary>
        /// Computes the dot product of the first three lanes of <paramref name="left"/> and
        /// <paramref name="right"/>, choosing the best instruction set available at runtime.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>A vector with the 3D dot product in every lane (hardware paths).</returns>
        public static VectorF DotProduct3D(VectorFParam1_3 left, VectorFParam1_3 right)
        {
            if (Sse41.IsSupported)
            {
                // dpps with 0x7F: multiply lanes 0-2 only, write the sum to every lane.
                return Sse41.DotProduct(left, right, 0b_0111_1111);
            }

            // Without dpps the multiply is still vectorized; only the way the products are summed
            // differs between SSE3 and plain SSE.
            if (Sse3.IsSupported)
            {
                VectorF products = Multiply(left, right);

                // Mask before summing so the fourth product does not contribute
                // (by its name the mask clears lane 3).
                VectorF masked = And(products, MaskWToZero);

                // Two horizontal adds leave the total of the remaining lanes in every lane.
                VectorF pairSums = HorizontalAdd(masked, masked);
                return HorizontalAdd(pairSums, pairSums);
            }

            if (Sse.IsSupported)
            {
                VectorF products = Multiply(left, right);

                // Add the second and third products into lane 0 one at a time, then spread lane 0.
                VectorF rotated = Sse.Shuffle(products, products, Shuffle(2, 1, 2, 1));
                VectorF partial = Sse.AddScalar(products, rotated);

                rotated = Sse.Shuffle(rotated, rotated, Shuffle(1, 1, 1, 1));
                VectorF total = Sse.AddScalar(partial, rotated);

                return Sse.Shuffle(total, total, Shuffle(0, 0, 0, 0));
            }

            return DotProduct3D_Software(left, right);
        }
Пример #18
0
        /// <summary>
        /// Overwrites <paramref name="dst"/> in groups of four floats: each group is replaced by the
        /// dot product of that group with the vector (scalar[0], 0, 0, 0), written to all four slots.
        /// </summary>
        /// <param name="scalar">Source span; only its first element is read.</param>
        /// <param name="dst">Span that is read and overwritten in place; any length is accepted.</param>
        /// <exception cref="ArgumentException"><paramref name="scalar"/> is empty.</exception>
        private unsafe void DotProductU(Span<float> scalar, Span<float> dst)
        {
            // Pinning an empty span yields a null pointer, which the scalar load would dereference.
            if (scalar.IsEmpty)
            {
                throw new ArgumentException("At least one element is required.", nameof(scalar));
            }

            const int lanes = 4;

            fixed (float* pdst = dst)
            fixed (float* psrc = scalar)
            {
                var pDstEnd     = pdst + dst.Length;
                // End of the part that can be loaded and stored as whole 4-float vectors.
                var pVectorEnd  = pdst + (dst.Length - dst.Length % lanes);
                var pDstCurrent = pdst;

                // Lane 0 = scalar[0], lanes 1-3 = 0.
                var scalarVector128 = Sse.LoadScalarVector128(psrc);

                while (pDstCurrent < pVectorEnd)
                {
                    var dstVector = Sse.LoadVector128(pDstCurrent);
                    dstVector = Sse41.DotProduct(dstVector, scalarVector128, 0xff);
                    Sse.Store(pDstCurrent, dstVector);

                    pDstCurrent += lanes;
                }

                // Fewer than four floats remain: a full-width load/store here would run past the
                // end of dst, so go through a zero-padded scratch vector instead.
                int remaining = (int)(pDstEnd - pDstCurrent);
                if (remaining > 0)
                {
                    float* scratch = stackalloc float[lanes];
                    for (int k = 0; k < lanes; k++)
                    {
                        scratch[k] = k < remaining ? pDstCurrent[k] : 0f;
                    }

                    Sse.Store(scratch, Sse41.DotProduct(Sse.LoadVector128(scratch), scalarVector128, 0xff));

                    for (int k = 0; k < remaining; k++)
                    {
                        pDstCurrent[k] = scratch[k];
                    }
                }
            }
        }
Пример #19
0
        /// <summary>
        /// Computes the dot product of the first two lanes of <paramref name="left"/> and
        /// <paramref name="right"/>, choosing the best instruction set available at runtime.
        /// </summary>
        /// <param name="left">First operand.</param>
        /// <param name="right">Second operand.</param>
        /// <returns>A vector with the 2D dot product in every lane (hardware paths).</returns>
        public static Vector128<float> DotProduct2D(Vector128<float> left, Vector128<float> right)
        {
            if (Sse41.IsSupported)
            {
                // dpps with 0x3F: multiply lanes 0 and 1 only, write the sum to every lane.
                return Sse41.DotProduct(left, right, 0b_0011_1111);
            }

            // Without dpps the multiply is still vectorized; only the way the products are summed
            // differs between SSE3 and plain SSE.
            if (Sse3.IsSupported)
            {
                Vector128<float> products = Sse.Multiply(left, right);

                // Mask before summing (SingleConstants.MaskW is declared outside this block).
                Vector128<float> masked = Sse.And(products, SingleConstants.MaskW);

                // Horizontal add of a vector with itself gives (e0+e1, e2+e3, e0+e1, e2+e3).
                Vector128<float> pairSums = Sse3.HorizontalAdd(masked, masked);

                // movsldup copies lane 0 into lane 1 and lane 2 into lane 3, so every lane ends up
                // with e0+e1.
                return Sse3.MoveLowAndDuplicate(pairSums);
            }

            if (Sse.IsSupported)
            {
                Vector128<float> products = Sse.Multiply(left, right);

                // Shuffle, add into lane 0, then shuffle again; the lane selections come from the
                // ShuffleValues constants, which are declared outside this block.
                Vector128<float> second = Sse.Shuffle(products, products, ShuffleValues.YYYY);
                Vector128<float> summed = Sse.AddScalar(products, second);

                return Sse.Shuffle(summed, summed, ShuffleValues.XXXX);
            }

            return DotProduct2D_Software(left, right);
        }
Пример #20
0
        /// <summary>
        /// Sums, over every element of <c>arr</c>, the square root of the 4-lane dot product of
        /// that element with itself.
        /// </summary>
        /// <returns>The accumulated sum as a float.</returns>
        public float Vec4Length_Sse_Array()
        {
            var vectors = arr;
            int count   = vectors.Length;
            float total = 0.0f;

            unsafe
            {
                fixed (MyVector4* first = vectors)
                {
                    for (int index = 0; index < count; ++index)
                    {
                        var lanes = Sse.LoadVector128((float*)(first + index));

                        // 0xF1: multiply all four lanes, write the sum to lane 0 only.
                        float lengthSquared = Sse41.DotProduct(lanes, lanes, 0xF1).GetElement(0);
                        total += MathF.Sqrt(lengthSquared);
                    }
                }
            }

            return total;
        }
Пример #21
0
        /// <summary>
        /// Repeats batches of <c>LENGTH</c> SSE4.1 dot products until cancellation is requested.
        /// </summary>
        /// <param name="cancellationToken">Token that ends the loop once cancellation is requested.</param>
        /// <returns>The number of completed batches, or 0 when SSE4.1 is not supported.</returns>
        public override ulong Run(CancellationToken cancellationToken)
        {
            if (!Sse41.IsSupported)
            {
                return 0uL;
            }

            var source      = new Span<float>(new[] { RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT });
            var destination = new Span<float>(
                new[] { ANOTHER_RANDOM_FLOAT, ANOTHER_RANDOM_FLOAT, ANOTHER_RANDOM_FLOAT, ANOTHER_RANDOM_FLOAT });
            ulong completed = 0uL;

            unsafe
            {
                fixed (float* pDestination = destination)
                fixed (float* pSource = source)
                {
                    var operand     = Sse.LoadVector128(pSource);
                    var accumulator = Sse.LoadVector128(pDestination);

                    while (!cancellationToken.IsCancellationRequested)
                    {
                        for (var step = 0; step < LENGTH; step++)
                        {
                            // 0xFF: high nibble = multiply all lanes, low nibble = write the sum
                            // to all lanes.
                            accumulator = Sse41.DotProduct(accumulator, operand, 0xFF);
                        }

                        Sse.Store(pDestination, accumulator);

                        completed++;
                    }
                }
            }

            return completed;
        }
Пример #22
0
        // x86/x64 SIMD instruction reference table (SSE to AVX2):
        // https://www.officedaytime.com/tips/simd.html
        // Arithmetic -> dot product -> DPPS
        /// <summary>
        /// Sum of squares of <paramref name="vs"/> using one SSE4.1 DotProduct per four bytes.
        /// </summary>
        /// <param name="vs">Input bytes.</param>
        /// <returns>The sum of the squares of all elements.</returns>
        private unsafe long Test7_Intrinsics_SSE41_DotProduct_float(byte[] vs)
        {
            int step       = Vector128<int>.Count;
            int vectorEnd  = vs.Length - (vs.Length % step);
            long sum       = 0;

            fixed (byte* data = vs)
            {
                for (int offset = 0; offset < vectorEnd; offset += step)
                {
                    // Widen 4 bytes to 4 ints, then convert them to floats.
                    Vector128<float> lanes =
                        Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(data + offset));

                    // Multiply all four lanes (bits 4-7 set) and put the sum in lane 0 (bit 0 set).
                    Vector128<float> dot = Sse41.DotProduct(lanes, lanes, 0b11110001);
                    sum += (long)dot.GetElement(0);
                }
            }

            // Scalar tail for the elements that do not fill a whole vector.
            for (int index = vectorEnd; index < vs.Length; index++)
            {
                sum += vs[index] * vs[index];
            }

            return sum;
        }
Пример #23
0
 /// <summary>
 /// Returns the 4-lane dot product of the four floats starting at <c>a.X</c> and <c>b.X</c>.
 /// </summary>
 public static unsafe float Dot(Float4 a, Float4 b)
 {
     var first  = Sse.LoadVector128(&a.X);
     var second = Sse.LoadVector128(&b.X);

     // 0xFF: multiply all four lanes and write the sum to every lane; ToScalar reads lane 0.
     return Sse41.DotProduct(first, second, 0xFF).ToScalar();
 }
Пример #24
0
        /// <summary>
        /// Console benchmark that computes the dot product of two random float arrays four ways
        /// (scalar loop, scalar loop unrolled by 4, SSE4.1 DotProduct, SSE4.1 DotProduct unrolled
        /// by 4), then prints the timings, the timings relative to the scalar loop, and the results.
        /// </summary>
        /// <param name="args">Unused.</param>
        static void Main(string[] args)
        {
            const int DATASIZE = 1048576;
            const int TEST     = 64;

            float[] time = new float[4];

            Console.WriteLine("Test retry: " + TEST);
            Console.WriteLine("____________________");

            float[] f = new float[DATASIZE];
            float[] g = new float[DATASIZE];

            // One generator for the whole fill instead of a new instance per element.
            Random random = new Random();
            for (int i = 0; i < DATASIZE; i++)
            {
                f[i] = (float)(random.NextDouble() * 2 - 1);
                g[i] = (float)(random.NextDouble() * 2 - 1);
            }

            float[] suma = new float[4] {
                0, 0, 0, 0
            };

            Stopwatch stopwatch = new Stopwatch();

            stopwatch.Start();
            //FPU
            for (int t = 0; t < TEST; t++)
            {
                suma[0] = 0;
                for (int i = 0; i < DATASIZE; i++)
                {
                    suma[0] += f[i] * g[i];
                }
            }
            stopwatch.Stop();
            Console.WriteLine("FPU loop time " + stopwatch.Elapsed.TotalMilliseconds);
            time[0] = (float)stopwatch.Elapsed.TotalMilliseconds;

            stopwatch.Restart();

            //FPUx4
            for (int t = 0; t < TEST; t++)
            {
                suma[1] = 0;
                for (int i = 0; i < DATASIZE / 4 * 4; i += 4)
                {
                    suma[1] += f[i] * g[i];
                    suma[1] += f[i + 1] * g[i + 1];
                    suma[1] += f[i + 2] * g[i + 2];
                    suma[1] += f[i + 3] * g[i + 3];
                }
            }
            stopwatch.Stop();
            Console.WriteLine("FPU 4xloop time " + stopwatch.Elapsed.TotalMilliseconds);
            time[1] = (float)stopwatch.Elapsed.TotalMilliseconds;

            stopwatch.Restart();

            //SSE
            for (int t = 0; t < TEST; t++)
            {
                suma[2] = 0;
                unsafe
                {
                    fixed (float* ptr_f = &f[0])
                    fixed (float* ptr_g = &g[0])
                    {
                        // Accumulate in a register that starts at zero. Loading or storing a full
                        // vector at &suma[2] would touch two floats past the end of the 4-element
                        // array, so only lane 0 is written back.
                        Vector128<float> suma3 = Vector128<float>.Zero;

                        for (int i = 0; i < DATASIZE / 4 * 4; i += 4)
                        {
                            Vector128<float> vector  = Sse.LoadVector128(ptr_f + i);
                            Vector128<float> vector2 = Sse.LoadVector128(ptr_g + i);

                            // 255: multiply all four lanes, write the sum to every lane.
                            suma3 = Sse.Add(suma3, Sse41.DotProduct(vector, vector2, 255));
                        }
                        suma[2] = suma3.ToScalar();
                    }
                }
            }

            stopwatch.Stop();
            Console.WriteLine("SSE loop time " + stopwatch.Elapsed.TotalMilliseconds);
            time[2] = (float)stopwatch.Elapsed.TotalMilliseconds;

            stopwatch.Restart();
            for (int t = 0; t < TEST; t++)
            {
                suma[3] = 0;
                //SSEx4
                unsafe
                {
                    fixed (float* ptr_f = &f[0])
                    fixed (float* ptr_g = &g[0])
                    {
                        // Same as above: a full vector at &suma[3] would reach three floats past
                        // the end of the array, so accumulate in a register and store lane 0 only.
                        Vector128<float> suma4 = Vector128<float>.Zero;

                        // Each iteration consumes 16 floats, so bound the loop by whole groups of 16.
                        for (int i = 0; i < DATASIZE / 16 * 16; i += 16)
                        {
                            Vector128<float> vector  = Sse.LoadVector128(ptr_f + i);
                            Vector128<float> vector2 = Sse.LoadVector128(ptr_g + i);
                            suma4 = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));

                            vector  = Sse.LoadVector128(ptr_f + i + 4);
                            vector2 = Sse.LoadVector128(ptr_g + i + 4);
                            suma4   = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));

                            vector  = Sse.LoadVector128(ptr_f + i + 8);
                            vector2 = Sse.LoadVector128(ptr_g + i + 8);
                            suma4   = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));

                            vector  = Sse.LoadVector128(ptr_f + i + 12);
                            vector2 = Sse.LoadVector128(ptr_g + i + 12);
                            suma4   = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));
                        }
                        suma[3] = suma4.ToScalar();
                    }
                }
            }

            stopwatch.Stop();
            Console.WriteLine("SSE 4xloop time " + stopwatch.Elapsed.TotalMilliseconds);
            time[3] = (float)stopwatch.Elapsed.TotalMilliseconds;

            // Timings relative to the plain scalar loop.
            Console.WriteLine("\nFPU loop " + 1);
            Console.WriteLine("FPU 4xloop " + time[1] / time[0]);
            Console.WriteLine("SSE loop " + time[2] / time[0]);
            Console.WriteLine("SSE 4xloop " + time[3] / time[0]);

            Console.WriteLine("\nFPU loop result " + suma[0]);
            Console.WriteLine("FPU 4xloop result " + suma[1]);
            Console.WriteLine("SSE loop result " + suma[2]);
            Console.WriteLine("SSE 4xloop result " + suma[3]);

            Console.ReadKey();
        }
Пример #25
0
        /// <summary>
        /// Computes the dot product of <paramref name="a"/> and <paramref name="b"/> with a
        /// caller-supplied dpps control byte and returns the resulting four lanes as a Float4.
        /// </summary>
        /// <param name="a">First operand; its by-value copy also receives the result.</param>
        /// <param name="b">Second operand.</param>
        /// <param name="control">
        /// dpps control byte. The default 0x7F (0111 1111) leaves the fourth lane out of the
        /// multiplication and writes the sum to all four lanes.
        /// </param>
        /// <returns>The four lanes produced by the dot-product instruction.</returns>
        public static unsafe Float4 Dot(Float4 a, Float4 b, byte control = 0x7F)
        {
            var first  = Sse.LoadVector128(&a.X);
            var second = Sse.LoadVector128(&b.X);
            var dot    = Sse41.DotProduct(first, second, control);

            // a is a by-value copy, so writing through its address only changes the return value.
            Sse.Store(&a.X, dot);

            return a;
        }
Пример #26
0
        /// <summary>
        /// Exercises <c>Sse41.DotProduct</c> on <c>float</c> and <c>double</c> vectors with several
        /// control masks, through direct calls and once per element type through reflection.
        /// </summary>
        /// <param name="args">Not used.</param>
        /// <returns>
        /// <c>Pass</c> when every check succeeds or SSE4.1 is not supported; <c>Fail</c> as soon as
        /// any check has failed.
        /// </returns>
        static unsafe int Main(string[] args)
        {
            int testResult = Pass;

            if (Sse41.IsSupported)
            {
                using (TestTable<float> floatTable = new TestTable<float>(
                           new float[4] { 1, -5, 100, 0 },
                           new float[4] { 22, -1, -50, 0 },
                           new float[4]))
                {
                    // Prints the header followed by the current output lanes and marks the run as failed.
                    void ReportFloatFailure(string header)
                    {
                        Console.WriteLine(header);
                        foreach (var item in floatTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    var vf1 = Unsafe.Read<Vector128<float>>(floatTable.inArray1Ptr);
                    var vf2 = Unsafe.Read<Vector128<float>>(floatTable.inArray2Ptr);

                    // 255: all four products are summed and the sum is expected in every lane.
                    var vf3 = Sse41.DotProduct(vf1, vf2, 255);
                    Unsafe.Write(floatTable.outArrayPtr, vf3);

                    if (!floatTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]) +
                                                                   (x[2] * y[2]) + (x[3] * y[3]))))
                    {
                        ReportFloatFailure("SSE41 DotProduct failed on float:");
                    }

                    // 127: only the first three products are summed; the sum is expected in every lane.
                    vf3 = Sse41.DotProduct(vf1, vf2, 127);
                    Unsafe.Write(floatTable.outArrayPtr, vf3);

                    if (!floatTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]) +
                                                                   (x[2] * y[2]))))
                    {
                        ReportFloatFailure("SSE41 DotProduct failed on float:");
                    }

                    // 63: only the first two products are summed; the sum is expected in every lane.
                    vf3 = Sse41.DotProduct(vf1, vf2, 63);
                    Unsafe.Write(floatTable.outArrayPtr, vf3);

                    if (!floatTable.CheckResult((x, y, z) => z.All(result => result == ((x[0] * y[0]) + (x[1] * y[1])))))
                    {
                        ReportFloatFailure("3 SSE41 DotProduct failed on float:");
                    }

                    // 85: products 0 and 2 are summed; the sum is expected in lanes 0 and 2, zero elsewhere.
                    vf3 = Sse41.DotProduct(vf1, vf2, 85);
                    Unsafe.Write(floatTable.outArrayPtr, vf3);

                    if (!floatTable.CheckResult((x, y, z) => z[0] == ((x[0] * y[0]) + (x[2] * y[2])) &&
                                                z[2] == ((x[0] * y[0]) + (x[2] * y[2])) &&
                                                z[1] == 0 && z[3] == 0))
                    {
                        ReportFloatFailure("SSE41 DotProduct failed on float:");
                    }

                    // Same as the 255 case, but the intrinsic is invoked through reflection.
                    vf3 = (Vector128<float>) typeof(Sse41).GetMethod(nameof(Sse41.DotProduct), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) }).Invoke(null, new object[] { vf1, vf2, (byte)(255) });
                    Unsafe.Write(floatTable.outArrayPtr, vf3);

                    if (!floatTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]) +
                                                                   (x[2] * y[2]) + (x[3] * y[3]))))
                    {
                        ReportFloatFailure("SSE41 DotProduct failed on float:");
                    }
                }

                using (TestTable<double> doubleTable = new TestTable<double>(
                           new double[2] { 1, -5 },
                           new double[2] { 22, -1 },
                           new double[2]))
                {
                    // Prints the header followed by the current output lanes and marks the run as failed.
                    void ReportDoubleFailure(string header)
                    {
                        Console.WriteLine(header);
                        foreach (var item in doubleTable.outArray)
                        {
                            Console.Write(item + ", ");
                        }
                        Console.WriteLine();
                        testResult = Fail;
                    }

                    var vf1 = Unsafe.Read<Vector128<double>>(doubleTable.inArray1Ptr);
                    var vf2 = Unsafe.Read<Vector128<double>>(doubleTable.inArray2Ptr);

                    // 51: both products are summed and the sum is expected in both lanes.
                    var vf3 = Sse41.DotProduct(vf1, vf2, 51);
                    Unsafe.Write(doubleTable.outArrayPtr, vf3);

                    if (!doubleTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]))))
                    {
                        ReportDoubleFailure("SSE41 DotProduct failed on double:");
                    }

                    // 19: only product 0 contributes; it is expected in both lanes.
                    vf3 = Sse41.DotProduct(vf1, vf2, 19);
                    Unsafe.Write(doubleTable.outArrayPtr, vf3);

                    if (!doubleTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]))))
                    {
                        ReportDoubleFailure("SSE41 DotProduct failed on double:");
                    }

                    // 17: only product 0 contributes; it is expected in lane 0, lane 1 must be zero.
                    vf3 = Sse41.DotProduct(vf1, vf2, 17);
                    Unsafe.Write(doubleTable.outArrayPtr, vf3);

                    if (!doubleTable.CheckResult((x, y, z) => z[0] == (x[0] * y[0]) &&
                                                 z[1] == 0))
                    {
                        ReportDoubleFailure("SSE41 DotProduct failed on double:");
                    }

                    // 33: only product 1 contributes; it is expected in lane 0, lane 1 must be zero.
                    vf3 = Sse41.DotProduct(vf1, vf2, 33);
                    Unsafe.Write(doubleTable.outArrayPtr, vf3);

                    if (!doubleTable.CheckResult((x, y, z) => z[0] == (x[1] * y[1]) &&
                                                 z[1] == 0))
                    {
                        ReportDoubleFailure("SSE41 DotProduct failed on double:");
                    }

                    // Same as the 51 case, but the intrinsic is invoked through reflection.
                    vf3 = (Vector128<double>) typeof(Sse41).GetMethod(nameof(Sse41.DotProduct), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) }).Invoke(null, new object[] { vf1, vf2, (byte)(51) });
                    Unsafe.Write(doubleTable.outArrayPtr, vf3);

                    if (!doubleTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]))))
                    {
                        ReportDoubleFailure("SSE41 DotProduct failed on double:");
                    }
                }
            }

            return testResult;
        }
Пример #27
0
 /// <summary>
 /// Wrapper named after the native intrinsic; forwards directly to <c>Sse41.DotProduct</c>.
 /// </summary>
 /// <param name="a">Left operand.</param>
 /// <param name="b">Right operand.</param>
 /// <param name="control">Control byte passed through unchanged.</param>
 /// <returns>Whatever <c>Sse41.DotProduct</c> returns for the given operands and control byte.</returns>
 public static __m128 _mm_dp_ps(__m128 a, __m128 b, byte control)
 {
     return Sse41.DotProduct(a, b, control);
 }
Пример #28
0
 /// <summary>
 /// Returns the square root of the dot product of <paramref name="a"/> and <paramref name="b"/>.
 /// </summary>
 /// <param name="a">Left operand; a 128-bit vector is loaded starting at its X field.</param>
 /// <param name="b">Right operand; a 128-bit vector is loaded starting at its X field.</param>
 /// <returns>The scalar square root of the sum of the first three lane products.</returns>
 public static unsafe float Length(Float3 a, Float3 b)
 {
     // 0x7F keeps the fourth lane out of the multiplication. The previous mask (0xFF)
     // also multiplied that lane, which for a three-component vector is not component data.
     // NOTE(review): assumes Float3 holds exactly three floats starting at X - confirm.
     // If so, the 128-bit loads still read one float past the struct; only the
     // arithmetic is protected here.
     var dot = Sse41.DotProduct(Sse.LoadVector128(&a.X), Sse.LoadVector128(&b.X), 0x7F);

     // Only lane 0 is consumed.
     return Sse.SqrtScalar(dot).ToScalar();
 }
Пример #29
0
 /// <summary>
 /// Wrapper named after the native intrinsic; forwards directly to the double-precision
 /// overload of <c>Sse41.DotProduct</c>.
 /// </summary>
 /// <param name="left">Left operand.</param>
 /// <param name="right">Right operand.</param>
 /// <param name="control">Control byte passed through unchanged.</param>
 /// <returns>The vector produced by <c>Sse41.DotProduct</c>.</returns>
 public static Vector128 <double> _mm_dp_pd(Vector128 <double> left, Vector128 <double> right, byte control)
     => Sse41.DotProduct(left, right, control);
Пример #30
0
 /// <summary>
 /// Wrapper named after the native intrinsic; forwards directly to the single-precision
 /// overload of <c>Sse41.DotProduct</c>.
 /// </summary>
 /// <param name="left">Left operand.</param>
 /// <param name="right">Right operand.</param>
 /// <param name="control">Control byte passed through unchanged.</param>
 /// <returns>The vector produced by <c>Sse41.DotProduct</c>.</returns>
 public static Vector128 <float> _mm_dp_ps(Vector128 <float> left, Vector128 <float> right, byte control)
     => Sse41.DotProduct(left, right, control);