/// <summary>
/// Computes the dot product of all four lanes of <paramref name="left"/> and
/// <paramref name="right"/>, picking the best instruction set available.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>A vector whose lanes all hold the dot product (hardware paths); otherwise whatever DotProduct4D_Software returns.</returns>
public static Vector128<float> DotProduct4D(Vector128<float> left, Vector128<float> right)
{
    if (Sse41.IsSupported)
    {
        // High nibble: multiply all four lanes. Low nibble: write the sum into all four result lanes.
        const byte broadcastAll = 0b_1111_1111;
        return Sse41.DotProduct(left, right, broadcastAll);
    }

    if (Sse3.IsSupported)
    {
        // Lane-wise products, then two horizontal adds leave the total in every lane.
        Vector128<float> products = Sse.Multiply(left, right);
        Vector128<float> pairSums = Sse3.HorizontalAdd(products, products);
        return Sse3.HorizontalAdd(pairSums, pairSums);
    }

    if (Sse.IsSupported)
    {
        // Shuffle/add reduction; the final shuffle broadcasts the lane holding the total.
        // NOTE(review): lane bookkeeping depends on the ShuffleValues constants, which are declared elsewhere.
        Vector128<float> products = Sse.Multiply(left, right);
        Vector128<float> scratch = Sse.Shuffle(right, products, ShuffleValues.XXXY);
        scratch = Sse.Add(scratch, products);
        products = Sse.Shuffle(products, scratch, ShuffleValues.XXWX);
        products = Sse.Add(products, scratch);
        return Sse.Shuffle(products, products, ShuffleValues.ZZZZ);
    }

    return DotProduct4D_Software(left, right);
}
/// <summary>
/// Adds the two instance fields with Sse.Add, writes the sum to the data table's
/// output buffer and validates it against the same fields.
/// </summary>
public void RunFldScenario()
{
    var sum = Sse.Add(_fld1, _fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Repeatedly performs LENGTH packed-float additions per iteration until cancellation
/// is requested.
/// </summary>
/// <param name="cancellationToken">Token polled once per outer iteration.</param>
/// <returns>The number of completed outer iterations, or 0 when SSE is not supported.</returns>
public override ulong Run(CancellationToken cancellationToken)
{
    if (!Sse.IsSupported)
    {
        return 0uL;
    }

    var source = new Span<float>(new[] { RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT, RANDOM_FLOAT });
    var destination = new Span<float>(new float[4]);
    ulong completed = 0uL;

    unsafe
    {
        fixed (float* pDestination = destination)
        fixed (float* pSource = source)
        {
            var addend = Sse.LoadVector128(pSource);
            var accumulator = Sse.LoadVector128(pDestination);

            while (!cancellationToken.IsCancellationRequested)
            {
                for (var step = 0; step < LENGTH; step++)
                {
                    accumulator = Sse.Add(accumulator, addend);
                }

                // Store after every batch, as the original loop does.
                Sse.Store(pDestination, accumulator);
                completed++;
            }
        }
    }

    return completed;
}
/// <summary>
/// Computes the dot product of all four lanes of <paramref name="left"/> and
/// <paramref name="right"/>, picking the best instruction set available.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>A vector whose lanes all hold the dot product (hardware paths); otherwise whatever DotProduct4D_Software returns.</returns>
public static VectorF DotProduct4D(VectorFParam1_3 left, VectorFParam1_3 right)
{
    if (Sse41.IsSupported)
    {
        // This multiplies the first 4 elems of each and broadcasts it into each element of the returning vector
        const byte control = 0b_1111_1111;
        return Sse41.DotProduct(left, right, control);
    }
    else if (Sse3.IsSupported)
    {
        // Lane-wise products, then two horizontal adds leave the total in every lane.
        VectorF mul = Sse.Multiply(left, right);
        mul = Sse3.HorizontalAdd(mul, mul);
        return Sse3.HorizontalAdd(mul, mul);
    }
    else if (Sse.IsSupported)
    {
        VectorF copy = right;
        VectorF mul = Sse.Multiply(left, copy);
        copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));
        copy = Sse.Add(copy, mul);
        mul = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));
        // This must be a full-width add: the final shuffle broadcasts lane 2, and a
        // scalar add would only update lane 0, leaving lane 2 with a partial sum.
        mul = Sse.Add(mul, copy);
        return Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2));
    }

    return DotProduct4D_Software(left, right);
}
/// <summary>
/// Computes the squared distance between <paramref name="left"/> and
/// <paramref name="right"/>: the dot product of their difference with itself.
/// </summary>
/// <param name="left">First point.</param>
/// <param name="right">Second point.</param>
/// <returns>A vector whose lanes all hold the squared distance (hardware paths); otherwise whatever DistanceSquared4D_Software returns.</returns>
public static Vector4F DistanceSquared4D(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    if (Sse41.IsSupported)
    {
        Vector4F diff = Sse.Subtract(left, right);
        // This multiplies the first 4 elems of each and broadcasts it into each element of the returning vector
        const byte control = 0b_1111_1111;
        return Sse41.DotProduct(diff, diff, control);
    }
    else if (Sse3.IsSupported)
    {
        Vector4F diff = Sse.Subtract(left, right);
        // Lane-wise squares, then two horizontal adds leave the total in every lane.
        Vector4F mul = Sse.Multiply(diff, diff);
        mul = Sse3.HorizontalAdd(mul, mul);
        return Sse3.HorizontalAdd(mul, mul);
    }
    else if (Sse.IsSupported)
    {
        Vector4F diff = Sse.Subtract(left, right);
        Vector4F copy = diff;
        Vector4F mul = Sse.Multiply(diff, copy);
        copy = Sse.Shuffle(copy, mul, Helpers.Shuffle(1, 0, 0, 0));
        copy = Sse.Add(copy, mul);
        mul = Sse.Shuffle(mul, copy, Helpers.Shuffle(0, 3, 0, 0));
        // This must be a full-width add: the final shuffle broadcasts lane 2, and a
        // scalar add would only update lane 0, leaving lane 2 with a partial sum.
        mul = Sse.Add(mul, copy);
        return Sse.Shuffle(mul, mul, Helpers.Shuffle(2, 2, 2, 2));
    }

    return DistanceSquared4D_Software(left, right);
}
/// <summary>
/// Normalizes <paramref name="vector"/> by dividing every lane by the vector's
/// 4D length, i.e. the square root of its dot product with itself.
/// </summary>
/// <param name="vector">The vector to normalize.</param>
/// <returns>The vector divided by its length (hardware paths); otherwise whatever Normalize4D_Software returns.</returns>
public static VectorF Normalize4D(VectorFParam1_3 vector)
{
    if (Sse41.IsSupported)
    {
        // This multiplies the first 4 elems of each and broadcasts it into each element of the returning vector
        const byte control = 0b_1111_1111;
        // The dot product is the squared length, so take the square root before dividing,
        // matching the SSE3 and SSE paths below.
        return Sse.Divide(vector, Sse.Sqrt(Sse41.DotProduct(vector, vector, control)));
    }
    else if (Sse3.IsSupported)
    {
        // Lane-wise squares, then two horizontal adds leave the squared length in every lane.
        VectorF mul = Sse.Multiply(vector, vector);
        mul = Sse3.HorizontalAdd(mul, mul);
        return Sse.Divide(vector, Sse.Sqrt(Sse3.HorizontalAdd(mul, mul)));
    }
    else if (Sse.IsSupported)
    {
        VectorF copy = vector;
        VectorF mul = Sse.Multiply(vector, copy);
        copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));
        copy = Sse.Add(copy, mul);
        mul = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));
        // This must be a full-width add: the final shuffle broadcasts lane 2, and a
        // scalar add would only update lane 0, leaving lane 2 with a partial sum.
        mul = Sse.Add(mul, copy);
        return Sse.Divide(vector, Sse.Sqrt(Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2))));
    }

    return Normalize4D_Software(vector);
}
/// <summary>
/// Adds this instance's two fields with Sse.Add, writes the sum to the output buffer of
/// <paramref name="testClass"/> and asks it to validate the result.
/// </summary>
/// <param name="testClass">Test object supplying the output buffer and the validation routine.</param>
public void RunStructFldScenario(SimpleBinaryOpTest__AddSingle testClass)
{
    var sum = Sse.Add(_fld1, _fld2);

    Unsafe.Write(testClass._dataTable.outArrayPtr, sum);
    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
}
/// <summary>
/// Writes the element-wise sum of <paramref name="a"/> and <paramref name="b"/> into
/// <paramref name="s"/>, four floats at a time with SSE; the trailing 0-3 elements are
/// delegated to AddNaive.
/// </summary>
/// <param name="a">First input; its length determines how many elements are processed.</param>
/// <param name="b">Second input; must be at least as long as <paramref name="a"/>.</param>
/// <param name="s">Destination; must be at least as long as <paramref name="a"/>.</param>
/// <exception cref="ArgumentException">
/// <paramref name="b"/> or <paramref name="s"/> is shorter than <paramref name="a"/>.
/// </exception>
private static void AddSse(ReadOnlySpan<float> a, ReadOnlySpan<float> b, Span<float> s)
{
    // The loop below uses raw pointers bounded only by a.Length, so a shorter b or s
    // would be read or written past its end. Reject that up front.
    if (b.Length < a.Length || s.Length < a.Length)
    {
        throw new ArgumentException("Both 'b' and 's' must be at least as long as 'a'.");
    }

    var remainder = a.Length & 3;
    var length = a.Length - remainder;

    fixed (float* ptrA = a)
    fixed (float* ptrB = b)
    fixed (float* ptrS = s)
    {
        for (var i = 0; i < length; i += 4)
        {
            var left = Sse.LoadVector128(ptrA + i);
            var right = Sse.LoadVector128(ptrB + i);
            Sse.Store(ptrS + i, Sse.Add(left, right));
        }
    }

    if (remainder != 0)
    {
        // Elements [length, a.Length) do not fill a whole vector.
        AddNaive(a, b, s, length, a.Length);
    }
}
/// <summary>
/// For each index, computes sqrt((g-r)^2 + (b-g)^2 + (r-b)^2) from the three byte channels
/// and stores it in <paramref name="vv"/>. Processes four elements per step with SSE and
/// hands the remaining tail to Amari.
/// </summary>
/// <param name="red">Red channel; its length determines how many elements are processed.</param>
/// <param name="green">Green channel.</param>
/// <param name="blue">Blue channel.</param>
/// <param name="vv">Destination for the per-element results.</param>
private unsafe void Test44_Intrinsics_V128float_Sqrt(byte[] red, byte[] green, byte[] blue, float[] vv)
{
    int lanes = Vector128<float>.Count;
    int vectorEnd = red.Length - (red.Length % lanes);
    float* scratch = stackalloc float[lanes];

    fixed (byte* pR = red, pG = green, pB = blue)
    {
        for (int offset = 0; offset < vectorEnd; offset += lanes)
        {
            // Widen four bytes of each channel to int32, then convert to float.
            Vector128<float> r = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pR + offset));
            Vector128<float> g = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pG + offset));
            Vector128<float> b = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pB + offset));

            Vector128<float> greenMinusRed = Sse.Subtract(g, r);
            Vector128<float> blueMinusGreen = Sse.Subtract(b, g);
            Vector128<float> redMinusBlue = Sse.Subtract(r, b);

            Vector128<float> sumOfSquares = Sse.Add(
                Sse.Multiply(greenMinusRed, greenMinusRed),
                Sse.Multiply(blueMinusGreen, blueMinusGreen));
            sumOfSquares = Sse.Add(sumOfSquares, Sse.Multiply(redMinusBlue, redMinusBlue));

            // Spill to the stack buffer and copy lane by lane into the managed array.
            Sse.Store(scratch, Sse.Sqrt(sumOfSquares));
            for (int lane = 0; lane < lanes; lane++)
            {
                vv[offset + lane] = scratch[lane];
            }
        }
    }

    // Remaining elements [vectorEnd, red.Length) are handled by Amari.
    Amari(vectorEnd, red.Length, red, green, blue, vv);
}
/// <summary>
/// Sums all elements of <paramref name="a"/>, loading eight floats per step with AVX and
/// reducing with SSE/SSE3; the trailing 0-7 elements are summed by HorizontalAddNaive.
/// </summary>
/// <param name="a">The values to sum.</param>
/// <returns>The sum of all elements.</returns>
private static float HorizontalAddAvx(ReadOnlySpan<float> a)
{
    int tailCount = a.Length & 7;
    int vectorizedLength = a.Length - tailCount;
    Vector128<float> partials = Vector128.Create(0f);

    fixed (float* source = a)
    {
        for (int offset = 0; offset < vectorizedLength; offset += 8)
        {
            Vector256<float> block = Avx.LoadVector256(source + offset);
            Vector128<float> lower = Avx.ExtractVector128(block, 0);
            Vector128<float> upper = Avx.ExtractVector128(block, 1);

            // The horizontal add keeps the lane total equal to the running sum:
            // two lanes come from the new block, two from the previous partials.
            partials = Sse3.HorizontalAdd(Sse.Add(lower, upper), partials);
        }
    }

    // Two more horizontal adds collapse the four partial sums into lane 0.
    Vector128<float> pairs = Sse3.HorizontalAdd(partials, partials);
    Vector128<float> folded = Sse3.HorizontalAdd(pairs, partials);

    float sum = 0f;
    Sse.StoreScalar(&sum, folded);

    if (tailCount != 0)
    {
        sum += HorizontalAddNaive(a, vectorizedLength, a.Length);
    }

    return sum;
}
/// <summary>
/// Horizontally adds adjacent lane pairs of <paramref name="left"/> and <paramref name="right"/>.
/// </summary>
/// <param name="left">Supplies the two lower result lanes.</param>
/// <param name="right">Supplies the two upper result lanes.</param>
/// <returns>
/// On the hardware paths: (left.X + left.Y, left.Z + left.W, right.X + right.Y, right.Z + right.W).
/// Otherwise whatever HorizontalAdd_Software returns.
/// </returns>
public static Vector128<float> HorizontalAdd(Vector128<float> left, Vector128<float> right)
{
    if (Sse3.IsSupported)
    {
        return Sse3.HorizontalAdd(left, right);
    }

    if (!Sse.IsSupported)
    {
        return HorizontalAdd_Software(left, right);
    }

    // Without the hadd instruction: gather (Ax, Az, Bx, Bz) and (Ay, Aw, By, Bw) and add them
    // lane-wise. The Sse methods are used explicitly because doing this through the software
    // fallback would be slow.
    Vector128<float> evenLanes = Sse.Shuffle(left, right, ShuffleValues.XZXZ);
    Vector128<float> oddLanes = Sse.Shuffle(left, right, ShuffleValues.YWYW);
    return Sse.Add(evenLanes, oddLanes);
}
/// <summary>
/// Adds two fixed four-element float vectors with Sse.Add and checks every output lane
/// against the scalar sum.
/// </summary>
/// <param name="args">Unused.</param>
/// <returns>Pass when SSE is unsupported or all lanes match; otherwise Fail.</returns>
static unsafe int Main(string[] args)
{
    if (!Sse.IsSupported)
    {
        return Pass;
    }

    int outcome = Pass;

    using (TestTable<float> table = new TestTable<float>(new float[4] { 1, -5, 100, 0 }, new float[4] { 22, -1, -50, 0 }, new float[4]))
    {
        var lhs = Unsafe.Read<Vector128<float>>(table.inArray1Ptr);
        var rhs = Unsafe.Read<Vector128<float>>(table.inArray2Ptr);
        var sum = Sse.Add(lhs, rhs);
        Unsafe.Write(table.outArrayPtr, sum);

        if (!table.CheckResult((x, y, z) => x + y == z))
        {
            // Dump the output lanes to help diagnose the mismatch.
            Console.WriteLine("SSE Add failed on float:");
            foreach (var item in table.outArray)
            {
                Console.Write(item + ", ");
            }
            Console.WriteLine();
            outcome = Fail;
        }
    }

    return outcome;
}
// Sum of squares of a byte array, computed in parallel over partitions small enough
// that the per-lane float accumulation stays within the exactly representable integer range.
/// <summary>
/// Returns the sum of the squares of all bytes in <paramref name="vs"/>, using the SSE4.1
/// dot-product instruction on 16 bytes per step and one parallel task per partition.
/// </summary>
/// <param name="vs">The input bytes.</param>
/// <returns>The sum of <c>vs[i] * vs[i]</c> over all elements; 0 for an empty array.</returns>
private unsafe long Test28_Intrinsics_SSE41_DotProduct_float_MT_Kai(byte[] vs)
{
    // Partitioner.Create rejects an empty range (toExclusive must exceed fromInclusive),
    // so an empty input is answered directly.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<int>.Count * 4;

    // Maximum number of elements the Vector128<float> accumulator vTotal can take = 1032:
    // float significand (24 bits) / (byte max value * byte max value), times the lane count.
    // 16777215 / (255 * 255) * 4 = 1032.0471...; truncated, this is the
    // number of elements per partition (the partition size).
    int rangeSize = ((1 << 24) - 1) / (byte.MaxValue * byte.MaxValue) * Vector128<float>.Count; // 1032

    Parallel.ForEach(
        Partitioner.Create(0, vs.Length, rangeSize),
        (range) =>
        {
            int start = range.Item1;
            int end = range.Item2;
            // Last index of this partition that still starts a full 16-byte step.
            int lastIndex = end - (end - start) % simdLength;
            var vTotal = Vector128<float>.Zero;

            fixed (byte* p = vs)
            {
                for (int i = start; i < lastIndex; i += simdLength)
                {
                    // Multiply all four lanes (bits 5-8 set) and place the sum in lane 0 (bit 1 set).
                    Vector128<float> vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i));
                    vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110001));

                    // Place the sum in lane 1.
                    vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 4));
                    vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110010));

                    // Place the sum in lane 2.
                    vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 8));
                    vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11110100));

                    // Place the sum in lane 3.
                    vv = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i + 12));
                    vTotal = Sse.Add(vTotal, Sse41.DotProduct(vv, vv, 0b11111000));
                }
            }

            // Fold the four lane totals into an integer subtotal.
            long subtotal = 0;
            float* f = stackalloc float[Vector128<float>.Count];
            Sse.Store(f, vTotal);
            for (int lane = 0; lane < Vector128<float>.Count; lane++)
            {
                subtotal += (long)f[lane];
            }

            // Scalar tail of this partition.
            for (int i = lastIndex; i < end; i++)
            {
                subtotal += vs[i] * vs[i];
            }

            System.Threading.Interlocked.Add(ref total, subtotal);
        });

    return total;
}
/// <summary>
/// Adds the two fields of a freshly constructed test object with Sse.Add, writes the sum
/// to this instance's output buffer and validates it.
/// </summary>
public void RunLclFldScenario()
{
    var local = new SimpleBinaryOpTest__AddSingle();
    var sum = Sse.Add(local._fld1, local._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(local._fld1, local._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Adds the two fields of a local TestStruct with Sse.Add, writes the sum to the output
/// buffer and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    var local = TestStruct.Create();
    var sum = Sse.Add(local._fld1, local._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(local._fld1, local._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, then adds the two instance fields with Sse.Add, writes the sum
/// to the output buffer and validates it.
/// </summary>
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    var sum = Sse.Add(_fld1, _fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Adds <paramref name="left"/> and <paramref name="right"/> lane by lane.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The result of Sse.Add when SSE is supported; otherwise the software fallback's result.</returns>
public static Vector4F Add(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    if (!Sse.IsSupported)
    {
        return SoftwareFallbacks.SoftwareFallbacksVector4F.Add_Software(left, right);
    }

    return Sse.Add(left, right);
}
/// <summary>
/// Reads both operands from the input buffers via Unsafe.Read into locals, adds them with
/// Sse.Add, writes the sum to the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    var firstOperand = Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr);
    var secondOperand = Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr);
    var sum = Sse.Add(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Adds <paramref name="left"/> and <paramref name="right"/> lane by lane.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The result of Sse.Add when SSE is supported; otherwise the result of Add_Software.</returns>
public static VectorF Add(VectorFParam1_3 left, VectorFParam1_3 right)
{
    if (!Sse.IsSupported)
    {
        return Add_Software(left, right);
    }

    return Sse.Add(left, right);
}
/// <summary>
/// Loads both operands from the input buffers with aligned loads into locals, adds them
/// with Sse.Add, writes the sum to the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    var firstOperand = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr));
    var secondOperand = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr));
    var sum = Sse.Add(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Adds <paramref name="left"/> and <paramref name="right"/> lane by lane.
/// </summary>
/// <param name="left">First operand.</param>
/// <param name="right">Second operand.</param>
/// <returns>The result of Sse.Add when SSE is supported; otherwise the result of Add_Software.</returns>
public static Vector128<float> Add(Vector128<float> left, Vector128<float> right)
{
    if (!Sse.IsSupported)
    {
        return Add_Software(left, right);
    }

    return Sse.Add(left, right);
}
/// <summary>
/// Adds the two class-level variables with Sse.Add, writes the sum to the output buffer
/// and validates it.
/// </summary>
public void RunClsVarScenario()
{
    var sum = Sse.Add(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
}
/// <summary>
/// Adds two vectors loaded directly from the input buffers with aligned loads, writes the
/// sum to the output buffer and validates it against the raw input buffers.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // Loads stay inline as arguments, as in the original call.
    var sum = Sse.Add(Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr)), Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr)));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Adds two vectors read directly from the input buffers via Unsafe.Read, writes the sum
/// to the output buffer and validates it against the raw input buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // Reads stay inline as arguments, as in the original call.
    var sum = Sse.Add(Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr), Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, then adds the two fields of a freshly constructed test object
/// with Sse.Add, writes the sum to this instance's output buffer and validates it.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    var local = new SimpleBinaryOpTest__AddSingle();
    var sum = Sse.Add(local._fld1, local._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(local._fld1, local._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, then adds the two fields of a local TestStruct with Sse.Add,
/// writes the sum to the output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    var local = TestStruct.Create();
    var sum = Sse.Add(local._fld1, local._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(local._fld1, local._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Adds <paramref name="scalar"/> to every lane of <paramref name="vector"/>.
/// </summary>
/// <param name="vector">The vector operand.</param>
/// <param name="scalar">The value added to each lane.</param>
/// <returns>The lane-wise sum when SSE is supported; otherwise the result of Add_Software.</returns>
public static VectorF Add(VectorFParam1_3 vector, float scalar)
{
    if (!Sse.IsSupported)
    {
        return Add_Software(vector, scalar);
    }

    // Broadcast the scalar so a single packed add covers every lane.
    VectorF broadcast = Vector128.Create(scalar);
    return Sse.Add(vector, broadcast);
}
/// <summary>
/// Subtracts <paramref name="scalar"/> from every lane of <paramref name="vector"/>.
/// </summary>
/// <param name="vector">The vector operand (minuend).</param>
/// <param name="scalar">The value subtracted from each lane.</param>
/// <returns>The lane-wise difference when SSE is supported; otherwise the software fallback's result.</returns>
public static Vector4F Subtract(Vector4FParam1_3 vector, float scalar)
{
    if (Sse.IsSupported)
    {
        // Broadcast the scalar so a single packed subtract covers every lane.
        Vector4F expand = Vector128.Create(scalar);
        // Must be Subtract (not Add) so the hardware path agrees with Subtract_Software.
        return Sse.Subtract(vector, expand);
    }

    return SoftwareFallbacks.SoftwareFallbacksVector4F.Subtract_Software(vector, scalar);
}
/// <summary>
/// Announces the scenario, then loads both operands from the input buffers with aligned
/// loads into locals, adds them with Sse.Add, writes the sum to the output buffer and
/// validates it.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var firstOperand = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr));
    var secondOperand = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr));
    var sum = Sse.Add(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads four floats from the address of a local <c>vec</c>, computes
/// <c>loaded * 1 + 2</c> with Fma.MultiplyAdd, adds 3 and returns lane 0.
/// </summary>
/// <returns>Lane 0 of the final sum.</returns>
static unsafe float fmaTest()
{
    // NOTE(review): 'a' is never explicitly assigned before its address is taken;
    // its contents are whatever the runtime provides for this local - confirm this is intended.
    vec a;
    var ones = Vector128.Create(1f);
    var twos = Vector128.Create(2f);
    var threes = Vector128.Create(3f);

    var loaded = Sse.LoadVector128((float*)&a);
    twos = Fma.MultiplyAdd(loaded, ones, twos);

    return Sse.Add(twos, threes).ToScalar();
}