// Multi-threaded version of the SSE4.1 DotProduct sum of squares.
/// <summary>
/// Returns the sum of the squares of every byte in <paramref name="vs"/>, splitting the
/// array into ranges that are processed in parallel and combined with an interlocked add.
/// </summary>
/// <param name="vs">Bytes to square and sum. An empty array yields 0.</param>
/// <returns>The sum of <c>vs[i] * vs[i]</c> over the whole array.</returns>
private unsafe long Test17_Intrinsics_SSE41_DotProduct_float_MT(byte[] vs)
{
    // Partitioner.Create rejects an empty range, and there is nothing to sum anyway.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<int>.Count;

    // An array shorter than the processor count would give a range size of 0,
    // which Partitioner.Create rejects; clamp it to at least one element.
    int rangeSize = Math.Max(1, vs.Length / Environment.ProcessorCount);

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        long subtotal = 0;

        // End of the part of this range that can be consumed in whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                Vector128<float> vv = Sse2.ConvertToVector128Single(v);

                // Multiply all four lanes (bits 4-7 set) and put the sum into lane 0 (bit 0 set).
                Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                subtotal += (long)dp.GetElement(0);
            }
        }

        // Scalar tail of the range.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });

    return total;
}
/// <summary>
/// Computes the dot product of <paramref name="left"/> and <paramref name="right"/> over all
/// four lanes, choosing the best available instruction set and falling back to software.
/// </summary>
public static Vector128<float> DotProduct4D(Vector128<float> left, Vector128<float> right)
{
    if (Sse41.IsSupported)
    {
        // dpps with every mask bit set: multiply all four lanes and write the sum to every lane.
        return Sse41.DotProduct(left, right, 0b_1111_1111);
    }

    if (Sse3.IsSupported)
    {
        Vector128<float> products = Sse.Multiply(left, right);

        // Two horizontal adds leave the sum of all four products in every lane.
        Vector128<float> pairSums = Sse3.HorizontalAdd(products, products);
        return Sse3.HorizontalAdd(pairSums, pairSums);
    }

    if (Sse.IsSupported)
    {
        Vector128<float> products = Sse.Multiply(left, right);

        // Shuffle/add sequence that reduces the products without SSE3.
        Vector128<float> scratch = Sse.Shuffle(right, products, ShuffleValues.XXXY);
        scratch = Sse.Add(scratch, products);
        products = Sse.Shuffle(products, scratch, ShuffleValues.XXWX);
        products = Sse.Add(products, scratch);
        return Sse.Shuffle(products, products, ShuffleValues.ZZZZ);
    }

    return DotProduct4D_Software(left, right);
}
/// <summary>
/// Computes the four-lane dot product of <paramref name="left"/> and <paramref name="right"/>,
/// using SSE4.1, SSE3 or SSE when available and the software routine otherwise.
/// </summary>
public static VectorF DotProduct4D(VectorFParam1_3 left, VectorFParam1_3 right)
{
    if (Sse41.IsSupported)
    {
        // All mask bits set: multiply all four lanes and write the sum to every lane.
        return Sse41.DotProduct(left, right, 0b_1111_1111);
    }

    if (Sse3.IsSupported)
    {
        VectorF products = Sse.Multiply(left, right);
        VectorF pairSums = Sse3.HorizontalAdd(products, products);
        return Sse3.HorizontalAdd(pairSums, pairSums);
    }

    if (Sse.IsSupported)
    {
        VectorF products = Sse.Multiply(left, right);

        // Shuffle/add reduction for CPUs without SSE3.
        VectorF scratch = Sse.Shuffle(right, products, Shuffle(1, 0, 0, 0));
        scratch = Sse.Add(scratch, products);
        products = Sse.Shuffle(products, scratch, Shuffle(0, 3, 0, 0));
        products = Sse.AddScalar(products, scratch);
        return Sse.Shuffle(products, products, Shuffle(2, 2, 2, 2));
    }

    return DotProduct4D_Software(left, right);
}
/// <summary>
/// Computes the squared distance between <paramref name="left"/> and <paramref name="right"/>
/// as the four-lane dot product of their difference with itself.
/// </summary>
public static Vector4F DistanceSquared4D(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    if (Sse41.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);

        // All mask bits set: multiply all four lanes and write the sum to every lane.
        return Sse41.DotProduct(delta, delta, 0b_1111_1111);
    }

    if (Sse3.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);
        Vector4F pairSums = Sse3.HorizontalAdd(squares, squares);
        return Sse3.HorizontalAdd(pairSums, pairSums);
    }

    if (Sse.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);

        // Shuffle/add reduction for CPUs without SSE3.
        Vector4F scratch = Sse.Shuffle(delta, squares, Helpers.Shuffle(1, 0, 0, 0));
        scratch = Sse.Add(scratch, squares);
        squares = Sse.Shuffle(squares, scratch, Helpers.Shuffle(0, 3, 0, 0));
        squares = Sse.AddScalar(squares, scratch);
        return Sse.Shuffle(squares, squares, Helpers.Shuffle(2, 2, 2, 2));
    }

    return DistanceSquared4D_Software(left, right);
}
/// <summary>
/// Computes the dot product of the lower two doubles of <paramref name="left"/> and
/// <paramref name="right"/> and returns it widened to a 256-bit vector.
/// </summary>
public static Vector256<double> DotProduct2D(Vector256<double> left, Vector256<double> right)
{
    if (Sse41.IsSupported)
    {
        // dppd: multiply both lanes of the lower halves and write the sum to both lanes.
        Vector2D dot = Sse41.DotProduct(left.GetLower(), right.GetLower(), 0b_0011_1111);
        return Helpers.DuplicateToVector256(dot);
    }

    if (Sse3.IsSupported)
    {
        Vector2D products = Sse2.Multiply(left.GetLower(), right.GetLower());
        return Helpers.DuplicateToVector256(Sse3.HorizontalAdd(products, products));
    }

    if (Sse2.IsSupported)
    {
        Vector2D products = Sse2.Multiply(left.GetLower(), right.GetLower());
        Vector2D shuffled = Sse2.Shuffle(products, products, ShuffleValues.YXYX);
        var dot = Sse2.Add(products, shuffled);

        // Use the same 128-bit result for both halves of the 256-bit return value.
        return dot.ToVector256Unsafe().WithUpper(dot);
    }

    return DotProduct2D_Software(left, right);
}
/// <summary>
/// Normalizes <paramref name="vector"/> by dividing every lane by the vector's four-lane length.
/// </summary>
/// <param name="vector">The vector to normalize.</param>
/// <returns><paramref name="vector"/> divided by the square root of its dot product with itself.</returns>
public static VectorF Normalize4D(VectorFParam1_3 vector)
{
    if (Sse41.IsSupported)
    {
        // All mask bits set: multiply all four lanes and write the sum to every lane.
        const byte control = 0b_1111_1111;
        VectorF lengthSquared = Sse41.DotProduct(vector, vector, control);

        // Divide by the length, not the squared length, matching the SSE3/SSE branches below.
        return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
    }
    else if (Sse3.IsSupported)
    {
        VectorF mul = Sse.Multiply(vector, vector);
        mul = Sse3.HorizontalAdd(mul, mul);
        return Sse.Divide(vector, Sse.Sqrt(Sse3.HorizontalAdd(mul, mul)));
    }
    else if (Sse.IsSupported)
    {
        VectorF copy = vector;
        VectorF mul = Sse.Multiply(vector, copy);

        // Shuffle/add reduction for CPUs without SSE3.
        copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));
        copy = Sse.Add(copy, mul);
        mul = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));
        mul = Sse.AddScalar(mul, copy);
        return Sse.Divide(vector, Sse.Sqrt(Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2))));
    }

    return Normalize4D_Software(vector);
}
// Multi-threaded sum of squares that splits the array into ranges small enough
// for the float accumulators to stay exact.
/// <summary>
/// Returns the sum of the squares of every byte in <paramref name="vs"/>. Each parallel range
/// accumulates into a <c>Vector128&lt;float&gt;</c> and is then folded into a shared long total.
/// </summary>
/// <param name="vs">Bytes to square and sum. An empty array yields 0.</param>
/// <returns>The sum of <c>vs[i] * vs[i]</c> over the whole array.</returns>
private unsafe long Test28_Intrinsics_SSE41_DotProduct_float_MT_Kai(byte[] vs)
{
    // Partitioner.Create rejects an empty range, and there is nothing to sum anyway.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<int>.Count * 4;

    // Largest element count the float accumulator vector can hold exactly:
    // 24-bit float mantissa / (byte max * byte max) per lane, times the lane count
    // = 16777215 / (255 * 255) * 4 = 1032 (fraction truncated). Used as the range size.
    int rangeSize = ((1 << 24) - 1) / (byte.MaxValue * byte.MaxValue) * Vector128<float>.Count; // 1032

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        var vTotal = Vector128<float>.Zero;

        // End of the part of this range that can be consumed 16 bytes at a time.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Multiply all four lanes (bits 4-7 set) and put the sum into lane 0 (bit 0 set).
                Vector128<float> vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110001));

                // Same for the next three groups of four bytes, with the sum placed in lanes 1, 2 and 3.
                vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 4));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110010));

                vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 8));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110100));

                vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 12));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11111000));
            }
        }

        // Fold the four lane totals into an integer subtotal.
        long subtotal = 0;
        for (int lane = 0; lane < Vector128<float>.Count; lane++)
        {
            subtotal += (long)vTotal.GetElement(lane);
        }

        // Scalar tail of the range.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });

    return total;
}
/// <summary>
/// Multiplies a <see cref="Float4x4"/> by a <see cref="Float4"/>: each result component is the
/// four-lane dot product of the four floats starting at M11, M21, M31 and M41 with the four
/// floats starting at <c>b.X</c>.
/// </summary>
public static unsafe Float4 operator *(Float4x4 a, Float4 b)
{
    // NOTE(review): no Sse41.IsSupported check here — confirm callers guarantee SSE4.1.
    var vector = Sse.LoadVector128(&b.X);

    // 0xFF: multiply all four lanes and write the sum to every lane; lane 0 is read back.
    float x = Sse41.DotProduct(Sse.LoadVector128(&a.M11), vector, 0xFF).ToScalar();
    float y = Sse41.DotProduct(Sse.LoadVector128(&a.M21), vector, 0xFF).ToScalar();
    float z = Sse41.DotProduct(Sse.LoadVector128(&a.M31), vector, 0xFF).ToScalar();
    float w = Sse41.DotProduct(Sse.LoadVector128(&a.M41), vector, 0xFF).ToScalar();

    return new Float4 { X = x, Y = y, Z = z, W = w };
}
/// <summary>
/// Returns the length of <paramref name="vec"/>, reinterpreting the struct's memory as a
/// <c>Vector128&lt;float&gt;</c> through a pointer cast.
/// </summary>
static unsafe float Length_Sse_V6_Helper(MyVector4 vec)
{
    Vector128<float> lanes = *(Vector128<float>*)&vec;

    // 0xF1: multiply all four lanes and write the sum to lane 0 only.
    float lengthSquared = Sse41.DotProduct(lanes, lanes, 0xF1).GetElement(0);
    return MathF.Sqrt(lengthSquared);
}
/// <summary>
/// Returns the length of <paramref name="vec"/>, loading the struct's memory as four floats
/// with <c>Sse.LoadVector128</c>.
/// </summary>
static unsafe float Length_Sse_V2_Helper(MyVector4 vec)
{
    var lanes = Sse.LoadVector128((float*)&vec);

    // 0xF1: multiply all four lanes and write the sum to lane 0 only.
    float lengthSquared = Sse41.DotProduct(lanes, lanes, 0xF1).GetElement(0);
    return MathF.Sqrt(lengthSquared);
}
/// <summary>
/// Scales <paramref name="a"/> by the approximate reciprocal square root of the dot product of
/// its first three components with themselves, and returns the scaled copy.
/// </summary>
public static unsafe Float4 Normalize(Float4 a)
{
    float* components = &a.X;
    Vector128<float> v = Sse.LoadVector128(components);

    // 0x7F: multiply lanes 0-2 only (lane 3 is left out) and write the sum to every lane.
    Vector128<float> inverseLength = Sse.ReciprocalSqrt(Sse41.DotProduct(v, v, 0x7F));

    // Write the scaled lanes back into the by-value copy that is returned.
    Sse.Store(components, Sse.Multiply(v, inverseLength));
    return a;
}
/// <summary>
/// Computes the 2D dot product <c>x1 * x2 + y1 * y2</c>, using the SSE4.1 dppd instruction
/// when intrinsics are compiled in and supported.
/// </summary>
/// <param name="x1">X component of the first vector.</param>
/// <param name="y1">Y component of the first vector.</param>
/// <param name="x2">X component of the second vector.</param>
/// <param name="y2">Y component of the second vector.</param>
/// <returns>The dot product of (x1, y1) and (x2, y2).</returns>
public static unsafe double DotProduct2DSSE(double x1, double y1, double x2, double y2)
{
#if HAS_INTRINSICS
    if (Sse41.IsSupported)
    {
        // Parameters are not guaranteed to be adjacent in memory, so pack each pair into
        // its own 16-byte buffer instead of loading/storing 16 bytes at &x1 / &x2.
        double* first = stackalloc double[2];
        double* second = stackalloc double[2];
        first[0] = x1;
        first[1] = y1;
        second[0] = x2;
        second[1] = y2;

        var vf1 = Sse2.LoadVector128(first);
        var vf2 = Sse2.LoadVector128(second);

        // 51 = 0b0011_0011: multiply both lanes and write the sum to both lanes.
        Unsafe.Write(first, Sse41.DotProduct(vf1, vf2, 51));
        return first[0];
    }
#endif
    return (x1 * x2) + (y1 * y2);
}
// Intrinsics SSE4.1 DotProduct, processing four groups of four bytes per loop iteration.
/// <summary>
/// Returns the sum of the squares of every byte in <paramref name="vs"/>, accumulating
/// dot-product results in a float vector that is flushed to a long before it can lose precision.
/// </summary>
/// <param name="vs">Bytes to square and sum.</param>
/// <returns>The sum of <c>vs[i] * vs[i]</c> over the whole array.</returns>
private unsafe long Test8_Intrinsics_SSE41_DotProduct_float(byte[] vs)
{
    long total = 0;
    int simdLength = Vector128<int>.Count * 4;
    int lastIndex = vs.Length - (vs.Length % simdLength);

    // Each iteration adds at most 4 * 255 * 255 = 260,100 to a lane, so 64 iterations stay
    // at or below 16,646,400 < 2^24 and every float lane remains an exact integer.
    int blockLength = 64 * simdLength;

    fixed (byte* p = vs)
    {
        int i = 0;
        while (i < lastIndex)
        {
            // Written as a subtraction so that i + blockLength cannot overflow.
            int blockEnd = (lastIndex - i > blockLength) ? i + blockLength : lastIndex;
            var vTotal = Vector128<float>.Zero;

            for (; i < blockEnd; i += simdLength)
            {
                // Multiply all four lanes (bits 4-7 set) and put the sum into lane 0 (bit 0 set).
                Vector128<float> vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110001));

                // Same for the next three groups of four bytes, with the sum placed in lanes 1, 2 and 3.
                vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 4));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110010));

                vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 8));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110100));

                vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 12));
                vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11111000));
            }

            // Flush the exact per-lane sums into the 64-bit total.
            for (int lane = 0; lane < Vector128<float>.Count; lane++)
            {
                total += (long)vTotal.GetElement(lane);
            }
        }
    }

    // Scalar tail.
    for (int i = lastIndex; i < vs.Length; i++)
    {
        total += vs[i] * vs[i];
    }

    return total;
}
/// <summary>
/// Computes the squared distance between the first two components of <paramref name="left"/>
/// and <paramref name="right"/>, using the best available instruction set.
/// </summary>
public static Vector4F DistanceSquared2D(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    // SSE4.1 has a native dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);

        // 0b0011_1111: multiply lanes 0-1 only and write the sum to every lane.
        return Sse41.DotProduct(delta, delta, 0b_0011_1111);
    }

    // Without dpps the multiplication is still vectorized; the way the products are
    // summed differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);

        // Mask with MaskWAndZToZero (presumably clears lanes Z and W — confirm against its definition).
        Vector4F masked = Sse.And(squares, MaskWAndZToZero);

        // Horizontal add puts X+Y into lanes 0 and 2.
        Vector4F summed = Sse3.HorizontalAdd(masked, masked);

        // MoveLowAndDuplicate maps (X, Y, Z, W) to (X, X, Z, Z), spreading the sum to every lane.
        return Sse3.MoveLowAndDuplicate(summed);
    }

    if (Sse.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);
        Vector4F shuffled = Sse.Shuffle(squares, squares, Helpers.Shuffle(1, 1, 1, 1));
        squares = Sse.AddScalar(squares, shuffled);
        return Sse.Shuffle(squares, squares, Helpers.Shuffle(0, 0, 0, 0));
    }

    return DistanceSquared2D_Software(left, right);
}
/// <summary>
/// Loads 16 bytes from <c>ptr</c>, subtracts <c>subtmp</c>, splits the bytes into two groups via
/// <c>mask0</c>/<c>mask1</c>, dot-multiplies each group (as floats) with <c>mul0</c>/<c>mul1</c>
/// and returns the sum of the two results as an int.
/// </summary>
public unsafe int ParseSIMD()
{
    // NOTE(review): the meaning of ptr, subtmp, mask0/1 and mul0/1 is defined outside this method.
    var shifted = Sse2.Subtract(Sse.StaticCast<byte, sbyte>(Sse2.LoadVector128(ptr)), subtmp);

    var group0 = Sse2.ConvertToVector128Single(Sse.StaticCast<sbyte, int>(Ssse3.Shuffle(shifted, mask0)));
    var group1 = Sse2.ConvertToVector128Single(Sse.StaticCast<sbyte, int>(Ssse3.Shuffle(shifted, mask1)));

    // 0b11111000: multiply all four lanes and write the sum to lane 3 only.
    var sum0 = Sse2.ConvertToVector128Int32(Sse41.DotProduct(group0, mul0, 0b11111000));
    var sum1 = Sse2.ConvertToVector128Int32(Sse41.DotProduct(group1, mul1, 0b11111000));

    // Lane 3 holds the combined result.
    return Sse41.Extract(Sse2.Add(sum0, sum1), 3);
}
/// <summary>
/// Divides <paramref name="vector"/> by the length of its first two components, using the
/// best available instruction set and falling back to software.
/// </summary>
public static VectorF Normalize2D(VectorFParam1_3 vector)
{
    #region Manual Inline
    // SSE4.1 has a native dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        // 0b0011_1111: multiply lanes 0-1 only and write the sum to every lane.
        VectorF lengthSquared = Sse41.DotProduct(vector, vector, 0b_0011_1111);
        return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
    }

    // Without dpps the multiplication is still vectorized; the way the products are
    // summed differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        VectorF squares = Sse.Multiply(vector, vector);

        // Mask with MaskWAndZToZero (presumably clears lanes Z and W — confirm against its definition).
        VectorF masked = Sse.And(squares, MaskWAndZToZero);

        // Horizontal add puts X+Y into lanes 0 and 2.
        VectorF summed = Sse3.HorizontalAdd(masked, masked);

        // MoveLowAndDuplicate maps (X, Y, Z, W) to (X, X, Z, Z), spreading the sum to every lane.
        VectorF lengthSquared = Sse3.MoveLowAndDuplicate(summed);
        return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
    }

    if (Sse.IsSupported)
    {
        VectorF squares = Sse.Multiply(vector, vector);
        VectorF shuffled = Sse.Shuffle(squares, squares, Shuffle(1, 1, 1, 1));
        squares = Sse.AddScalar(squares, shuffled);
        squares = Sse.Shuffle(squares, squares, Shuffle(0, 0, 0, 0));
        return Sse.Divide(vector, Sse.Sqrt(squares));
    }
    #endregion

    return Normalize2D_Software(vector);
}
/// <summary>
/// Computes the dot product of the first three components of <paramref name="left"/> and
/// <paramref name="right"/>, using the best available instruction set.
/// </summary>
public static VectorF DotProduct3D(VectorFParam1_3 left, VectorFParam1_3 right)
{
    // SSE4.1 has a native dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        // 0b0111_1111: multiply lanes 0-2 only and write the sum to every lane.
        return Sse41.DotProduct(left, right, 0b_0111_1111);
    }

    // Without dpps the multiplication is still vectorized; the way the products are
    // summed differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        VectorF products = Multiply(left, right);

        // Mask with MaskWToZero (presumably clears lane W — confirm against its definition).
        VectorF masked = And(products, MaskWToZero);

        // Two horizontal adds fill the result with the sum.
        VectorF pairSums = HorizontalAdd(masked, masked);
        return HorizontalAdd(pairSums, pairSums);
    }

    if (Sse.IsSupported)
    {
        VectorF products = Multiply(left, right);

        // Shuffle the products around and accumulate them into lane 0 with AddScalar.
        VectorF shuffled = Sse.Shuffle(products, products, Shuffle(2, 1, 2, 1));
        products = Sse.AddScalar(products, shuffled);
        shuffled = Sse.Shuffle(shuffled, shuffled, Shuffle(1, 1, 1, 1));
        products = Sse.AddScalar(products, shuffled);
        return Sse.Shuffle(products, products, Shuffle(0, 0, 0, 0));
    }

    return DotProduct3D_Software(left, right);
}
/// <summary>
/// Processes <paramref name="dst"/> in groups of four floats, replacing each group with the
/// SSE4.1 dot product of that group and a vector holding <c>scalar[0]</c> in lane 0 and zeros
/// elsewhere (the product is written to every lane of the group).
/// </summary>
/// <param name="scalar">Span whose first element is loaded as the scalar operand.</param>
/// <param name="dst">Span that is read and overwritten in place. A trailing group shorter
/// than four floats is processed through a zero-padded scratch buffer.</param>
/// <exception cref="ArgumentException"><paramref name="scalar"/> is empty while
/// <paramref name="dst"/> is not.</exception>
private unsafe void DotProductU(Span<float> scalar, Span<float> dst)
{
    const int VectorLength = 4;

    if (dst.IsEmpty)
    {
        return;
    }

    // Pinning an empty span yields a null pointer, which the scalar load would dereference.
    if (scalar.IsEmpty)
    {
        throw new ArgumentException("The scalar span must contain at least one element.", nameof(scalar));
    }

    fixed (float* pdst = dst)
    fixed (float* psrc = scalar)
    {
        var scalarVector128 = Sse.LoadScalarVector128(psrc);

        int remainder = dst.Length % VectorLength;
        float* pDstCurrent = pdst;
        float* pFullEnd = pdst + (dst.Length - remainder);

        // Whole groups of four floats can be loaded and stored directly.
        while (pDstCurrent < pFullEnd)
        {
            var dstVector = Sse.LoadVector128(pDstCurrent);
            dstVector = Sse41.DotProduct(dstVector, scalarVector128, 0xff);
            Sse.Store(pDstCurrent, dstVector);
            pDstCurrent += VectorLength;
        }

        // A partial trailing group must not be loaded or stored as a full vector:
        // that would read and write past the end of dst.
        if (remainder != 0)
        {
            float* tail = stackalloc float[VectorLength];
            for (int i = 0; i < VectorLength; i++)
            {
                tail[i] = i < remainder ? pDstCurrent[i] : 0f;
            }

            Sse.Store(tail, Sse41.DotProduct(Sse.LoadVector128(tail), scalarVector128, 0xff));

            for (int i = 0; i < remainder; i++)
            {
                pDstCurrent[i] = tail[i];
            }
        }
    }
}
/// <summary>
/// Computes the dot product of the first two components of <paramref name="left"/> and
/// <paramref name="right"/>, using the best available instruction set.
/// </summary>
public static Vector128<float> DotProduct2D(Vector128<float> left, Vector128<float> right)
{
    // SSE4.1 has a native dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        // 0b0011_1111: multiply lanes 0-1 only and write the sum to every lane.
        return Sse41.DotProduct(left, right, 0b_0011_1111);
    }

    // Without dpps the multiplication is still vectorized; the way the products are
    // summed differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        Vector128<float> products = Sse.Multiply(left, right);

        // Mask with SingleConstants.MaskW (presumably clears lane W — confirm against its definition).
        Vector128<float> masked = Sse.And(products, SingleConstants.MaskW);

        // Horizontal add puts X+Y into lanes 0 and 2.
        Vector128<float> summed = Sse3.HorizontalAdd(masked, masked);

        // MoveLowAndDuplicate maps (X, Y, Z, W) to (X, X, Z, Z), spreading X+Y to every lane.
        return Sse3.MoveLowAndDuplicate(summed);
    }

    if (Sse.IsSupported)
    {
        Vector128<float> products = Sse.Multiply(left, right);
        Vector128<float> shuffled = Sse.Shuffle(products, products, ShuffleValues.YYYY);
        products = Sse.AddScalar(products, shuffled);
        return Sse.Shuffle(products, products, ShuffleValues.XXXX);
    }

    return DotProduct2D_Software(left, right);
}
/// <summary>
/// Sums the lengths of every <c>MyVector4</c> in <c>arr</c>, computing each squared length
/// with the SSE4.1 dot product instruction.
/// </summary>
public float Vec4Length_Sse_Array()
{
    var vectors = arr;
    float lengthSum = 0.0f;

    unsafe
    {
        fixed (MyVector4* first = vectors)
        {
            for (int index = 0, count = vectors.Length; index < count; index++)
            {
                var lanes = Sse.LoadVector128((float*)(first + index));

                // 0xF1: multiply all four lanes and write the sum to lane 0 only.
                float lengthSquared = Sse41.DotProduct(lanes, lanes, 0xF1).GetElement(0);
                lengthSum += MathF.Sqrt(lengthSquared);
            }
        }
    }

    return lengthSum;
}
/// <summary>
/// Repeatedly executes batches of <c>LENGTH</c> SSE4.1 dot products until cancellation is
/// requested and returns the number of completed batches (0 when SSE4.1 is unavailable).
/// </summary>
public override ulong Run(CancellationToken cancellationToken)
{
    if (!Sse41.IsSupported)
    {
        return 0uL;
    }

    var source = new Span<float>(new[] { RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT });
    var dst = new Span<float>(new[] { ANOTHER_RANDOM_FLOAT, ANOTHER_RANDOM_FLOAT, ANOTHER_RANDOM_FLOAT, ANOTHER_RANDOM_FLOAT });
    ulong iterations = 0;

    unsafe
    {
        fixed (float* pdst = dst)
        fixed (float* psrc = source)
        {
            var srcVector = Sse.LoadVector128(psrc);
            var dstVector = Sse.LoadVector128(pdst);

            for (; !cancellationToken.IsCancellationRequested; iterations++)
            {
                for (var step = 0; step < LENGTH; step++)
                {
                    // Bits 4-7 (F): multiply every lane. Bits 0-3 (F): write the result to every lane.
                    dstVector = Sse41.DotProduct(dstVector, srcVector, 0xFF);
                }

                // Store after each batch so the computed value is written back to memory.
                Sse.Store(pdst, dstVector);
            }
        }
    }

    return iterations;
}
// x86/x64 SIMD instruction table (SSE through AVX2):
// https://www.officedaytime.com/tips/simd.html
// Arithmetic -> dot product -> DPPS
// Intrinsics SSE4.1 DotProduct
/// <summary>
/// Returns the sum of the squares of every byte in <paramref name="vs"/>, four bytes per
/// SSE4.1 dot product, with a scalar loop for the remaining bytes.
/// </summary>
private unsafe long Test7_Intrinsics_SSE41_DotProduct_float(byte[] vs)
{
    int width = Vector128<int>.Count;
    int vectorEnd = vs.Length - (vs.Length % width);
    long sum = 0;

    fixed (byte* source = vs)
    {
        int offset = 0;
        while (offset < vectorEnd)
        {
            Vector128<float> lanes = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(source + offset));

            // Multiply all four lanes (bits 4-7 set) and put the sum into lane 0 (bit 0 set).
            sum += (long)Sse41.DotProduct(lanes, lanes, 0b11110001).GetElement(0);
            offset += width;
        }
    }

    // Scalar tail.
    for (int tail = vectorEnd; tail < vs.Length; tail++)
    {
        int value = vs[tail];
        sum += value * value;
    }

    return sum;
}
/// <summary>
/// Returns the four-lane dot product of the floats starting at <c>a.X</c> and <c>b.X</c>.
/// </summary>
public static unsafe float Dot(Float4 a, Float4 b)
{
    var left = Sse.LoadVector128(&a.X);
    var right = Sse.LoadVector128(&b.X);

    // 0xFF: multiply all four lanes and write the sum to every lane; lane 0 is returned.
    return Sse41.DotProduct(left, right, 0xFF).ToScalar();
}
/// <summary>
/// Benchmarks four ways of computing the dot product of two float arrays (scalar loop,
/// 4x unrolled scalar loop, SSE4.1 loop, 4x unrolled SSE4.1 loop) and prints the timings,
/// the timings relative to the scalar loop, and the computed results.
/// </summary>
static void Main(string[] args)
{
    const int DATASIZE = 1048576;
    const int TEST = 64;

    // The SSE sections below would throw on hardware without SSE4.1.
    if (!Sse41.IsSupported)
    {
        Console.WriteLine("SSE4.1 is not supported on this CPU.");
        return;
    }

    float[] time = new float[4];
    Console.WriteLine("Test retry: " + TEST);
    Console.WriteLine("____________________");

    float[] f = new float[DATASIZE];
    float[] g = new float[DATASIZE];

    // One generator for the whole fill: creating a new Random per element is slow and,
    // with time-based seeding, can produce identical values.
    Random random = new Random();
    for (int i = 0; i < DATASIZE; i++)
    {
        f[i] = (float)(random.NextDouble() * 2 - 1);
        g[i] = (float)(random.NextDouble() * 2 - 1);
    }

    float[] suma = new float[4] { 0, 0, 0, 0 };
    Stopwatch stopwatch = new Stopwatch();
    stopwatch.Start();

    // FPU
    for (int t = 0; t < TEST; t++)
    {
        suma[0] = 0;
        for (int i = 0; i < DATASIZE; i++)
        {
            suma[0] += f[i] * g[i];
        }
    }
    stopwatch.Stop();
    Console.WriteLine("FPU loop time " + stopwatch.Elapsed.TotalMilliseconds);
    time[0] = (float)stopwatch.Elapsed.TotalMilliseconds;
    stopwatch.Restart();

    // FPUx4
    for (int t = 0; t < TEST; t++)
    {
        suma[1] = 0;
        for (int i = 0; i < DATASIZE / 4 * 4; i += 4)
        {
            suma[1] += f[i] * g[i];
            suma[1] += f[i + 1] * g[i + 1];
            suma[1] += f[i + 2] * g[i + 2];
            suma[1] += f[i + 3] * g[i + 3];
        }
    }
    stopwatch.Stop();
    Console.WriteLine("FPU 4xloop time " + stopwatch.Elapsed.TotalMilliseconds);
    time[1] = (float)stopwatch.Elapsed.TotalMilliseconds;
    stopwatch.Restart();

    // SSE
    for (int t = 0; t < TEST; t++)
    {
        suma[2] = 0;
        unsafe
        {
            fixed (float* ptr_s3 = &suma[2])
            fixed (float* ptr_f = &f[0])
            fixed (float* ptr_g = &g[0])
            {
                // Load and store only the single float at suma[2]; a full 128-bit
                // load/store here would run past the end of the 4-element array.
                Vector128<float> suma3 = Sse.LoadScalarVector128(ptr_s3);
                for (int i = 0; i < DATASIZE; i += 4)
                {
                    Vector128<float> vector = Sse.LoadVector128(ptr_f + i);
                    Vector128<float> vector2 = Sse.LoadVector128(ptr_g + i);
                    suma3 = Sse.Add(suma3, Sse41.DotProduct(vector, vector2, 255));
                }
                Sse.StoreScalar(ptr_s3, suma3);
            }
        }
    }
    stopwatch.Stop();
    Console.WriteLine("SSE loop time " + stopwatch.Elapsed.TotalMilliseconds);
    time[2] = (float)stopwatch.Elapsed.TotalMilliseconds;
    stopwatch.Restart();

    // SSEx4
    for (int t = 0; t < TEST; t++)
    {
        suma[3] = 0;
        unsafe
        {
            fixed (float* ptr_s4 = &suma[3])
            fixed (float* ptr_f = &f[0])
            fixed (float* ptr_g = &g[0])
            {
                // suma[3] is the last element, so only a scalar load/store is in bounds.
                Vector128<float> suma4 = Sse.LoadScalarVector128(ptr_s4);
                for (int i = 0; i < DATASIZE / 4 * 4; i += 16)
                {
                    Vector128<float> vector = Sse.LoadVector128(ptr_f + i);
                    Vector128<float> vector2 = Sse.LoadVector128(ptr_g + i);
                    suma4 = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));

                    vector = Sse.LoadVector128(ptr_f + i + 4);
                    vector2 = Sse.LoadVector128(ptr_g + i + 4);
                    suma4 = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));

                    vector = Sse.LoadVector128(ptr_f + i + 8);
                    vector2 = Sse.LoadVector128(ptr_g + i + 8);
                    suma4 = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));

                    vector = Sse.LoadVector128(ptr_f + i + 12);
                    vector2 = Sse.LoadVector128(ptr_g + i + 12);
                    suma4 = Sse.Add(suma4, Sse41.DotProduct(vector, vector2, 255));
                }
                Sse.StoreScalar(ptr_s4, suma4);
            }
        }
    }
    stopwatch.Stop();
    Console.WriteLine("SSE 4xloop time " + stopwatch.Elapsed.TotalMilliseconds);
    time[3] = (float)stopwatch.Elapsed.TotalMilliseconds;

    // Timings relative to the plain scalar loop.
    Console.WriteLine("\nFPU loop " + 1);
    Console.WriteLine("FPU 4xloop " + time[1] / time[0]);
    Console.WriteLine("SSE loop " + time[2] / time[0]);
    Console.WriteLine("SSE 4xloop " + time[3] / time[0]);

    Console.WriteLine("\nFPU loop result " + suma[0]);
    Console.WriteLine("FPU 4xloop result " + suma[1]);
    Console.WriteLine("SSE loop result " + suma[2]);
    Console.WriteLine("SSE 4xloop result " + suma[3]);
    Console.ReadKey();
}
// The default control 0x7F = 0111 1111 means the w-component is not multiplied
// and the result is written to all 4 components.
/// <summary>
/// Computes the SSE4.1 dot product of the floats starting at <c>a.X</c> and <c>b.X</c> using
/// <paramref name="control"/> as the dpps mask, and returns the result vector as a <see cref="Float4"/>.
/// </summary>
public static unsafe Float4 Dot(Float4 a, Float4 b, byte control = 0x7F)
{
    var left = Sse.LoadVector128(&a.X);
    var right = Sse.LoadVector128(&b.X);

    // The by-value copy of a is reused as the output buffer.
    Sse.Store(&a.X, Sse41.DotProduct(left, right, control));
    return a;
}
/// <summary>
/// Exercises <c>Sse41.DotProduct</c> for float and double vectors with several control masks,
/// both through direct calls and through reflection, and returns <c>Pass</c> or <c>Fail</c>.
/// Does nothing (and returns <c>Pass</c>) when SSE4.1 is not supported.
/// </summary>
static unsafe int Main(string[] args)
{
    int testResult = Pass;

    if (!Sse41.IsSupported)
    {
        return testResult;
    }

    using (TestTable<float> floatTable = new TestTable<float>(new float[4] { 1, -5, 100, 0 }, new float[4] { 22, -1, -50, 0 }, new float[4]))
    {
        // Prints the header and the output array, then marks the run as failed.
        Action<string> reportFloatFailure = header =>
        {
            Console.WriteLine(header);
            foreach (var item in floatTable.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            testResult = Fail;
        };

        var vf1 = Unsafe.Read<Vector128<float>>(floatTable.inArray1Ptr);
        var vf2 = Unsafe.Read<Vector128<float>>(floatTable.inArray2Ptr);

        // 255: multiply all four lanes, write the sum to every lane.
        var vf3 = Sse41.DotProduct(vf1, vf2, 255);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]) + (x[2] * y[2]) + (x[3] * y[3]))))
        {
            reportFloatFailure("SSE41 DotProduct failed on float:");
        }

        // 127: multiply lanes 0-2, write the sum to every lane.
        vf3 = Sse41.DotProduct(vf1, vf2, 127);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]) + (x[2] * y[2]))))
        {
            reportFloatFailure("SSE41 DotProduct failed on float:");
        }

        // 63: multiply lanes 0-1, write the sum to every lane.
        vf3 = Sse41.DotProduct(vf1, vf2, 63);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) => z.All(result => result == ((x[0] * y[0]) + (x[1] * y[1])))))
        {
            reportFloatFailure("3 SSE41 DotProduct failed on float:");
        }

        // 85: multiply lanes 0 and 2, write the sum to lanes 0 and 2 and zero to lanes 1 and 3.
        vf3 = Sse41.DotProduct(vf1, vf2, 85);
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) => z[0] == ((x[0] * y[0]) + (x[2] * y[2])) && z[2] == ((x[0] * y[0]) + (x[2] * y[2])) && z[1] == 0 && z[3] == 0))
        {
            reportFloatFailure("SSE41 DotProduct failed on float:");
        }

        // Same as the first case, but invoked through reflection.
        vf3 = (Vector128<float>)typeof(Sse41).GetMethod(nameof(Sse41.DotProduct), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) }).Invoke(null, new object[] { vf1, vf2, (byte)(255) });
        Unsafe.Write(floatTable.outArrayPtr, vf3);
        if (!floatTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]) + (x[2] * y[2]) + (x[3] * y[3]))))
        {
            reportFloatFailure("SSE41 DotProduct failed on float:");
        }
    }

    using (TestTable<double> doubleTable = new TestTable<double>(new double[2] { 1, -5 }, new double[2] { 22, -1 }, new double[2]))
    {
        // Prints the header and the output array, then marks the run as failed.
        Action<string> reportDoubleFailure = header =>
        {
            Console.WriteLine(header);
            foreach (var item in doubleTable.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            testResult = Fail;
        };

        var vf1 = Unsafe.Read<Vector128<double>>(doubleTable.inArray1Ptr);
        var vf2 = Unsafe.Read<Vector128<double>>(doubleTable.inArray2Ptr);

        // 51: multiply both lanes, write the sum to both lanes.
        var vf3 = Sse41.DotProduct(vf1, vf2, 51);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]))))
        {
            reportDoubleFailure("SSE41 DotProduct failed on double:");
        }

        // 19: multiply lane 0 only, write the product to both lanes.
        vf3 = Sse41.DotProduct(vf1, vf2, 19);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]))))
        {
            reportDoubleFailure("SSE41 DotProduct failed on double:");
        }

        // 17: multiply lane 0 only, write the product to lane 0 and zero to lane 1.
        vf3 = Sse41.DotProduct(vf1, vf2, 17);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) => z[0] == (x[0] * y[0]) && z[1] == 0))
        {
            reportDoubleFailure("SSE41 DotProduct failed on double:");
        }

        // 33: multiply lane 1 only, write the product to lane 0 and zero to lane 1.
        vf3 = Sse41.DotProduct(vf1, vf2, 33);
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) => z[0] == (x[1] * y[1]) && z[1] == 0))
        {
            reportDoubleFailure("SSE41 DotProduct failed on double:");
        }

        // Same as the first double case, but invoked through reflection.
        vf3 = (Vector128<double>)typeof(Sse41).GetMethod(nameof(Sse41.DotProduct), new Type[] { vf1.GetType(), vf2.GetType(), typeof(byte) }).Invoke(null, new object[] { vf1, vf2, (byte)(51) });
        Unsafe.Write(doubleTable.outArrayPtr, vf3);
        if (!doubleTable.CheckResult((x, y, z) => z.All(result => result == (x[0] * y[0]) + (x[1] * y[1]))))
        {
            reportDoubleFailure("SSE41 DotProduct failed on double:");
        }
    }

    return testResult;
}
/// <summary>
/// C-intrinsic-style alias that forwards to <c>Sse41.DotProduct</c> with the given control mask.
/// </summary>
public static __m128 _mm_dp_ps(__m128 a, __m128 b, byte control)
{
    return Sse41.DotProduct(a, b, control);
}
/// <summary>
/// Returns the square root of the dot product of the first three components of
/// <paramref name="a"/> and <paramref name="b"/>.
/// </summary>
/// <param name="a">First three-component vector.</param>
/// <param name="b">Second three-component vector.</param>
/// <returns><c>sqrt(a . b)</c> computed over lanes 0-2 only.</returns>
public static unsafe float Length(Float3 a, Float3 b)
{
    // 0x7F: multiply lanes 0-2 only and write the sum to every lane. The fourth lane of each
    // 128-bit load is whatever follows the vector in memory, so it must not enter the product
    // (a masked-out lane contributes exactly 0, whatever its contents).
    // NOTE(review): assumes Float3 is three consecutive floats starting at X; the 16-byte
    // load still reads 4 bytes past them — confirm the struct layout/padding.
    var dot = Sse41.DotProduct(Sse.LoadVector128(&a.X), Sse.LoadVector128(&b.X), 0x7F);
    return Sse.SqrtScalar(dot).ToScalar();
}
/// <summary>
/// C-intrinsic-style alias that forwards to the double-precision <c>Sse41.DotProduct</c>
/// with the given control mask.
/// </summary>
public static Vector128<double> _mm_dp_pd(Vector128<double> left, Vector128<double> right, byte control)
    => Sse41.DotProduct(left, right, control);
/// <summary>
/// C-intrinsic-style alias that forwards to the single-precision <c>Sse41.DotProduct</c>
/// with the given control mask.
/// </summary>
public static Vector128<float> _mm_dp_ps(Vector128<float> left, Vector128<float> right, byte control)
    => Sse41.DotProduct(left, right, control);