// Scenario: feeds the two fields of a freshly created TestStruct local into Sse3.AddSubtract,
// stores the result at the data table's output pointer and hands inputs + output to ValidateResult.
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    var localStruct = TestStruct.Create();
    var addSub = Sse3.AddSubtract(localStruct._fld1, localStruct._fld2);

    // Fetch the output pointer once and reuse it for both the store and the validation.
    var destination = _dataTable.outArrayPtr;
    Unsafe.Write(destination, addSub);
    ValidateResult(localStruct._fld1, localStruct._fld2, destination);
}
// Scenario: passes the _clsVar1/_clsVar2 members straight to Sse3.AddSubtract, writes the
// result to the output buffer and validates it against the same two members.
public void RunClsVarScenario()
{
    var addSub = Sse3.AddSubtract(_clsVar1, _clsVar2);

    // One read of the output pointer serves both the store and the validation call.
    var destination = _dataTable.outArrayPtr;
    Unsafe.Write(destination, addSub);
    ValidateResult(_clsVar1, _clsVar2, destination);
}
// Scenario: builds a new HorizontalBinaryOpTest__HorizontalAddSingle instance as a local and
// uses its _fld1/_fld2 fields as the operands of Sse3.HorizontalAdd.
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    var instance = new HorizontalBinaryOpTest__HorizontalAddSingle();
    var horizontalSum = Sse3.HorizontalAdd(instance._fld1, instance._fld2);

    // Store the result, then let ValidateResult compare it with the operands.
    Unsafe.Write(_dataTable.outArrayPtr, horizontalSum);
    ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
}
// Scenario: reads both Vector128<Double> operands with Unsafe.Read directly in the argument
// list of Sse3.AddSubtract, then stores and validates the result.
public void RunBasicScenario_UnsafeRead()
{
    // Operands are deliberately kept inline in the call rather than in locals.
    Vector128<Double> addSub = Sse3.AddSubtract(
        Unsafe.Read<Vector128<Double>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<Double>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, addSub);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
// Scenario: loads both operands with Sse2.LoadAlignedVector128 inside the argument list of
// Sse3.AddSubtract, then stores the result and validates it against the input buffers.
public void RunBasicScenario_LoadAligned()
{
    var addSub = Sse3.AddSubtract(
        Sse2.LoadAlignedVector128((Double*)(_dataTable.inArray1Ptr)),
        Sse2.LoadAlignedVector128((Double*)(_dataTable.inArray2Ptr)));

    // Reuse a single read of the output pointer for the store and the validation.
    var destination = _dataTable.outArrayPtr;
    Unsafe.Write(destination, addSub);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, destination);
}
// Scenario: loads both Single operands with Sse.LoadVector128 inside the argument list of
// Sse3.HorizontalSubtract, then stores the result and validates it against the input buffers.
public void RunBasicScenario_Load()
{
    var horizontalDiff = Sse3.HorizontalSubtract(
        Sse.LoadVector128((Single*)(_dataTable.inArray1Ptr)),
        Sse.LoadVector128((Single*)(_dataTable.inArray2Ptr)));

    // Reuse a single read of the output pointer for the store and the validation.
    var destination = _dataTable.outArrayPtr;
    Unsafe.Write(destination, horizontalDiff);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, destination);
}
// Scenario: reads both Vector128<Single> operands with Unsafe.Read directly in the argument
// list of Sse3.HorizontalAdd, then stores and validates the result.
public void RunBasicScenario_UnsafeRead()
{
    // Operands are deliberately kept inline in the call rather than in locals.
    Vector128<Single> horizontalSum = Sse3.HorizontalAdd(
        Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, horizontalSum);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
// Scenario: builds a new AlternatingBinaryOpTest__AddSubtractDouble instance as a local and
// uses its _fld1/_fld2 fields as the operands of Sse3.AddSubtract.
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    var instance = new AlternatingBinaryOpTest__AddSubtractDouble();
    var addSub = Sse3.AddSubtract(instance._fld1, instance._fld2);

    // Store the result, then let ValidateResult compare it with the operands.
    Unsafe.Write(_dataTable.outArrayPtr, addSub);
    ValidateResult(instance._fld1, instance._fld2, _dataTable.outArrayPtr);
}
// Sums `count` floats starting at `arr`, using a SIMD path when the matching instruction-set
// check passes and a plain scalar loop otherwise.
//   arr   - pointer to the first float to sum.
//   count - number of floats to read; asserted to be a multiple of 4.
// Returns the sum as a float. The SIMD paths accumulate four interleaved partial sums, so the
// order of the floating-point additions differs from the scalar loop.
static float ComputeSumSimd(float* arr, int count)
{
    // We're just going to assume that the length of the data is a multiple of 4, otherwise we'd have to handle the
    // other cases. It's not hard, but tedious.
    Assert.IsTrue(count % 4 == 0);

    // The only Sse3-qualified intrinsic used below is hadd_ps, so SSE3 is the feature to test for.
    // (Previously this was gated on SSSE3, which needlessly sent SSE3-only targets to the scalar path.)
    if (Sse3.IsSse3Supported)
    {
        // To sum up all values in the array, we split the array into 4 subarrays and store their sums in the variable
        // `sum` below.
        v128 sum = new v128(0f);
        for (int i = 0; i < count; i += 4)
        {
            // Load 4 floats from memory.
            v128 reg = loadu_ps(arr + i);
            sum = add_ps(sum, reg);
        }

        // At this point, we have the sums of 4 subarrays in `sum` and we still need to merge them. SSE3 has a helpful
        // instruction for this:
        sum = Sse3.hadd_ps(sum, sum);

        // Now the first and third lane hold the sum of the first two subarrays and the second and fourth lane contain
        // the sum of the last two subarrays.
        sum = Sse3.hadd_ps(sum, sum);

        // Finally, all four lanes hold the same value (the sum of all subarrays) and we can return the first value
        // as a float.
        return cvtss_f32(sum);

        // or alternatively, simply write:
        // return sum.Float0 + sum.Float1 + sum.Float2 + sum.Float3;
    }
    else if (IsNeonSupported)
    {
        // Same as above: 4 subarrays to accumulate the sum
        v128 sum = new v128(0f);
        for (int i = 0; i < count; i += 4)
        {
            // Load 4 floats from memory.
            v128 reg = vld1q_f32(arr + i);
            sum = vaddq_f32(sum, reg);
        }

        // Collapse the four partial sums into a single float.
        return vaddvq_f32(sum);
    }
    else
    {
        // Managed fallback, equivalent to ComputeSum()
        float sum = 0;
        for (int i = 0; i < count; i++)
        {
            sum += arr[i];
        }

        return sum;
    }
}
// Scenario: loads both Double operands into locals with Sse2.LoadAlignedVector128, runs
// Sse3.AddSubtract on those locals, then stores the result and validates it against them.
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var firstOperand = Sse2.LoadAlignedVector128((Double*)(_dataTable.inArray1Ptr));
    var secondOperand = Sse2.LoadAlignedVector128((Double*)(_dataTable.inArray2Ptr));

    var addSub = Sse3.AddSubtract(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, addSub);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
// Scenario: loads both Single operands into locals with Sse.LoadVector128, runs
// Sse3.HorizontalAdd on those locals, then stores the result and validates it against them.
public void RunLclVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

    var firstOperand = Sse.LoadVector128((Single*)(_dataTable.inArray1Ptr));
    var secondOperand = Sse.LoadVector128((Single*)(_dataTable.inArray2Ptr));

    var horizontalSum = Sse3.HorizontalAdd(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, horizontalSum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
// Scenario: loads both Single operands into locals with Sse.LoadAlignedVector128, runs
// Sse3.HorizontalSubtract on those locals, then stores the result and validates it against them.
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var firstOperand = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr));
    var secondOperand = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr));

    var horizontalDiff = Sse3.HorizontalSubtract(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, horizontalDiff);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
// Scenario: reads both Vector128<Double> operands into locals with Unsafe.Read, runs
// Sse3.HorizontalSubtract on those locals, then stores the result and validates it against them.
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    Vector128<Double> firstOperand = Unsafe.Read<Vector128<Double>>(_dataTable.inArray1Ptr);
    Vector128<Double> secondOperand = Unsafe.Read<Vector128<Double>>(_dataTable.inArray2Ptr);

    var horizontalDiff = Sse3.HorizontalSubtract(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, horizontalDiff);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
// Horizontal add of `left` and `right`.
// Uses the Sse3.HorizontalAdd intrinsic when Sse3.IsSupported is true; otherwise delegates to
// SoftwareFallbacksVector4F.HorizontalAdd_Software with the same arguments.
public static Vector4F HorizontalAdd(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    if (!Sse3.IsSupported)
    {
        // TODO can Sse be used over the software fallback?
        return SoftwareFallbacks.SoftwareFallbacksVector4F.HorizontalAdd_Software(left, right);
    }

    return Sse3.HorizontalAdd(left, right);
}
// Scenario: reads both Vector128<Double> operands into locals with Unsafe.Read, runs
// Sse3.HorizontalAdd on those locals, then stores the result and validates it against them.
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    Vector128<Double> firstOperand = Unsafe.Read<Vector128<Double>>(_dataTable.inArray1Ptr);
    Vector128<Double> secondOperand = Unsafe.Read<Vector128<Double>>(_dataTable.inArray2Ptr);

    var horizontalSum = Sse3.HorizontalAdd(firstOperand, secondOperand);

    Unsafe.Write(_dataTable.outArrayPtr, horizontalSum);
    ValidateResult(firstOperand, secondOperand, _dataTable.outArrayPtr);
}
// Scenario: loads both Single operands with Sse.LoadVector128 inside the argument list of
// Sse3.HorizontalSubtract, then stores the result and validates it against the input buffers.
public void RunBasicScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

    var horizontalDiff = Sse3.HorizontalSubtract(
        Sse.LoadVector128((Single*)(_dataTable.inArray1Ptr)),
        Sse.LoadVector128((Single*)(_dataTable.inArray2Ptr)));

    // Reuse a single read of the output pointer for the store and the validation.
    var destination = _dataTable.outArrayPtr;
    Unsafe.Write(destination, horizontalDiff);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, destination);
}
// Scenario: passes the _clsVar1/_clsVar2 members straight to Sse3.HorizontalSubtract, writes
// the result to the output buffer and validates it against the same two members.
public void RunClsVarScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario));

    var horizontalDiff = Sse3.HorizontalSubtract(_clsVar1, _clsVar2);

    // One read of the output pointer serves both the store and the validation call.
    var destination = _dataTable.outArrayPtr;
    Unsafe.Write(destination, horizontalDiff);
    ValidateResult(_clsVar1, _clsVar2, destination);
}
// Scenario: reads both Vector128<Single> operands with Unsafe.Read directly in the argument
// list of Sse3.HorizontalAdd, then stores and validates the result.
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    // Operands are deliberately kept inline in the call rather than in locals.
    Vector128<Single> horizontalSum = Sse3.HorizontalAdd(
        Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, horizontalSum);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
// Alternating subtract/add of two double2 values.
// With SSE3 available the operands are reinterpreted as v128 and handed to Sse3.addsub_pd;
// otherwise the same result is built from math.select and a subtraction.
public static double2 subadd(double2 a, double2 b)
{
    if (Sse3.IsSse3Supported)
    {
        // Reinterpret the operands in place as v128, and the v128 result back as double2.
        v128 packed = Sse3.addsub_pd(*(v128*)&a, *(v128*)&b);

        return *(double2*)&packed;
    }
    else
    {
        // Lane mask for the select: false for the first component, true for the second.
        // Assuming math.select picks its second argument where the mask is true, this negates
        // b only in the second lane, so the subtraction becomes an addition there.
        bool2 negateLane = new bool2(false, true);
        double2 signedB = math.select(b, -b, negateLane);

        return a - signedB;
    }
}
// Alternating subtract/add of two float4 values.
// With SSE3 available the operands are reinterpreted as v128 and handed to Sse3.addsub_ps;
// otherwise the same result is built from math.select and a subtraction.
public static float4 subadd(float4 a, float4 b)
{
    if (Sse3.IsSse3Supported)
    {
        // Reinterpret the operands in place as v128, and the v128 result back as float4.
        v128 packed = Sse3.addsub_ps(*(v128*)&a, *(v128*)&b);

        return *(float4*)&packed;
    }
    else
    {
        // Lane mask for the select: true for the second and fourth components only.
        // Assuming math.select picks its second argument where the mask is true, this negates
        // b in those lanes, so the subtraction becomes an addition there.
        bool4 negateLane = new bool4(false, true, false, true);
        float4 signedB = math.select(b, -b, negateLane);

        return a - signedB;
    }
}
// Scenario: creates a TestStruct local, loads its _fld1/_fld2 fields through their addresses
// with Sse.LoadVector128, and feeds the loaded vectors to Sse3.HorizontalAdd.
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var localStruct = TestStruct.Create();

    // The loads stay inline in the call; each takes the address of a field of the local.
    var horizontalSum = Sse3.HorizontalAdd(
        Sse.LoadVector128((Single*)(&localStruct._fld1)),
        Sse.LoadVector128((Single*)(&localStruct._fld2)));

    Unsafe.Write(_dataTable.outArrayPtr, horizontalSum);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
private Hit[] RayTraceAVXFaster(Ray ray) { Vector256 <double> dir = (Vector256 <double>)ray.Direction; Vector256 <double> vert0 = (Vector256 <double>)Vert0.Position; Vector256 <double> edge0to1 = (Vector256 <double>)Edge0to1; Vector256 <double> edge0to2 = (Vector256 <double>)Edge0to2; Vector256 <double> offset = Avx.Subtract((Vector256 <double>)ray.Origin, vert0); Vector256 <double> side1 = SIMDHelpers.Cross(offset, edge0to1); Vector256 <double> side2 = SIMDHelpers.Cross(dir, edge0to2); // Prepare all dot products Vector256 <double> uvTemp = Avx.Multiply(offset, side2); // u Vector256 <double> temp = Avx.Multiply(dir, side1); // v Vector256 <double> edge2Temp = Avx.Multiply(edge0to2, side1); Vector256 <double> distTemp = Avx.Multiply(edge0to1, side2); uvTemp = Avx.HorizontalAdd(uvTemp, temp); edge2Temp = Avx.HorizontalAdd(edge2Temp, edge2Temp); distTemp = Avx.HorizontalAdd(distTemp, distTemp); // Complete all dot products for SSE ops Vector128 <double> uvs = SIMDHelpers.Add2(uvTemp); Vector128 <double> dist = SIMDHelpers.Add2(edge2Temp); Vector128 <double> temp1 = SIMDHelpers.Add2(distTemp); Vector128 <double> temp2; // vec2 constants we'll be using later Vector128 <double> ones2 = SIMDHelpers.BroadcastScalar2(1D); Vector128 <double> zeroes2 = new Vector128 <double>(); // Reciprocal of distance along edge0to1 temp1 = Sse2.Divide(ones2, temp1); temp2 = Sse2.CompareOrdered(temp1, temp1); // Remove NaNs from the result, replaced with 0 Vector128 <double> distZeroed = Sse2.And(temp1, temp2); uvs = Sse2.Multiply(uvs, distZeroed); dist = Sse2.Multiply(dist, distZeroed); // compare uvs < 0 and > 1, dist < 0, jump out if any of those conditions are met temp1 = Sse2.CompareLessThan(uvs, zeroes2); temp2 = Mirror ? uvs : Sse3.HorizontalAdd(uvs, uvs); temp2 = Sse2.CompareGreaterThan(temp2, ones2); temp1 = Sse2.Or(temp1, temp2); temp2 = Sse2.CompareLessThan(dist, zeroes2); temp1 = Sse2.Or(temp1, temp2); if (!Avx.TestZ(temp1, temp1)) { return(default);
// Per-frame update. Calls Exit() when Back (gamepad one) or Escape is pressed, rebuilds the
// combined rotation/scale/view/projection matrix from the elapsed game time, clears the color
// raster, then projects every entry of _vertices to screen coordinates in parallel and writes
// 0xFFFFFF at the resulting raster position.
//   gameTime - timing snapshot; TotalGameTime.Ticks drives the Y rotation angle.
protected override void Update(GameTime gameTime)
{
    if (GamePad.GetState(PlayerIndex.One).Buttons.Back == ButtonState.Pressed || Keyboard.GetState().IsKeyDown(Keys.Escape))
    {
        Exit();
    }

    var width = _graphics.PreferredBackBufferWidth;
    var height = _graphics.PreferredBackBufferHeight;
    var ang1 = gameTime.TotalGameTime.Ticks / 9230000.0F;
    var aspect = (float)width / height;

    var look = Matrix4x4.CreateLookAt(new Vector3(0, 0, 60), Vector3.One, Vector3.UnitY);
    var proj = Matrix4x4.CreatePerspectiveFieldOfView((float)(Math.PI / 3.0), aspect, 1.0F, 1000.0F);
    var rotationY = Matrix4x4.CreateRotationY(ang1);
    var rotationX = Matrix4x4.CreateRotationX(0.5F);
    var comb = rotationY * rotationX * Matrix4x4.CreateScale(10.0F) * look * proj;

    // Each vector holds one COLUMN of the combined matrix, so an element-wise multiply with a
    // vertex followed by a horizontal sum yields one component of the transformed vertex.
    var m0 = Vector128.Create(comb.M11, comb.M21, comb.M31, comb.M41);
    var m1 = Vector128.Create(comb.M12, comb.M22, comb.M32, comb.M42);
    var m2 = Vector128.Create(comb.M13, comb.M23, comb.M33, comb.M43);
    var m3 = Vector128.Create(comb.M14, comb.M24, comb.M34, comb.M44);

    var inv = Vector128.Create(1.0F, -1.0F, 1.0F, 1.0F); // flips the sign of the Y component
    var half = Vector128.Create(0.5F);
    var one = Vector128.Create(1.0F);                    // hoisted: loop-invariant
    var screen = Vector128.Create(width, height, 0.0F, 0.0F);

    _colorRaster.Clear(-0x1000000);

    var workers = Environment.ProcessorCount;
    var chunks = _vertices.Length / workers;
    Parallel.For(0, workers, y =>
    {
        var offset = y * chunks;

        // Integer division drops Length % workers vertices; the last worker picks up that
        // remainder so every vertex is processed (previously the tail was silently skipped).
        var end = y == workers - 1 ? _vertices.Length : offset + chunks;

        for (var l = offset; l < end; l++)
        {
            var vv = _vertices[l];

            // Two rounds of horizontal adds collapse the four products into
            // (vv·m0, vv·m1, vv·m2, vv·m3).
            var h0 = Sse3.HorizontalAdd(Sse.Multiply(vv, m0), Sse.Multiply(vv, m1));
            var h1 = Sse3.HorizontalAdd(Sse.Multiply(vv, m2), Sse.Multiply(vv, m3));
            var h3 = Sse.Multiply(inv, Sse3.HorizontalAdd(h0, h1));

            // Divide by the fourth component, then map through 0.5 * (1 + v) and scale by
            // (width, height).
            var vv2 = Sse.Divide(h3, Vector128.Create(h3.GetElement(3)));
            var vv4 = Sse.Multiply(screen, Sse.Multiply(half, Sse.Add(one, vv2)));

            var f = Sse2.ConvertToVector128Int32(vv4);
            var sx = f.GetElement(0);
            var sy = f.GetElement(1);

            // NOTE(review): sx/sy are not range-checked here; whether off-screen coordinates are
            // safe depends on _colorRaster's indexer — confirm.
            _colorRaster[sx, sy] = 0xFFFFFF;
        }
    });

    base.Update(gameTime);
}
// Scenario: pins this instance's _fld1/_fld2, loads them through the pinned pointers with
// Sse2.LoadVector128, runs Sse3.AddSubtract, and stores/validates via the supplied test class.
//   testClass - provides the output buffer (_dataTable.outArrayPtr) and ValidateResult.
public void RunStructFldScenario_Load(AlternatingBinaryOpTest__AddSubtractDouble testClass)
{
    fixed (Vector128<Double>* pinnedFirst = &_fld1)
    fixed (Vector128<Double>* pinnedSecond = &_fld2)
    {
        // Loads stay inline in the intrinsic's argument list.
        var addSub = Sse3.AddSubtract(
            Sse2.LoadVector128((Double*)(pinnedFirst)),
            Sse2.LoadVector128((Double*)(pinnedSecond)));

        Unsafe.Write(testClass._dataTable.outArrayPtr, addSub);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
// Scenario: pins this instance's _fld1/_fld2, loads them through the pinned pointers with
// Sse.LoadVector128, runs Sse3.HorizontalAdd, and stores/validates via the supplied test class.
//   testClass - provides the output buffer (_dataTable.outArrayPtr) and ValidateResult.
public void RunStructFldScenario_Load(HorizontalBinaryOpTest__HorizontalAddSingle testClass)
{
    fixed (Vector128<Single>* pinnedFirst = &_fld1)
    fixed (Vector128<Single>* pinnedSecond = &_fld2)
    {
        // Loads stay inline in the intrinsic's argument list.
        var horizontalSum = Sse3.HorizontalAdd(
            Sse.LoadVector128((Single*)(pinnedFirst)),
            Sse.LoadVector128((Single*)(pinnedSecond)));

        Unsafe.Write(testClass._dataTable.outArrayPtr, horizontalSum);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
// Computes one Vector128-wide section of a diagonal of cost values and stores it into the
// buffer behind refDiag1Ptr. For each lane the stored value is
//   min(insert, delete, substitute) + 1
// where `insert`/`delete` are loaded from the refDiag2Ptr buffer, and `substitute` is the
// refDiag1Ptr value plus the CompareEqual mask of the source/target characters (all bits set,
// i.e. -1 for int lanes, where the characters match).
//   T            - lane type; only int and ushort are handled, any other type does nothing.
//   refDiag1Ptr  - T* buffer, read at [rowIndex - Count] and written at [rowIndex - Count + 1].
//   refDiag2Ptr  - T* buffer, read at [rowIndex - Count] and [rowIndex - Count + 1].
//   sourcePtr    - characters read forward starting at [rowIndex - Count].
//   targetPtr    - characters read starting at [columnIndex - 1], then lane-reversed.
//   rowIndex     - passed by ref but only read here.
public static unsafe void CalculateDiagonalSection_Sse41<T>(void* refDiag1Ptr, void* refDiag2Ptr, char* sourcePtr, char* targetPtr, ref int rowIndex, int columnIndex) where T : struct
{
    if (typeof(T) == typeof(int))
    {
        // Evaluated inside the branch on purpose: Vector128<T>.Count is only touched for handled T.
        int lanes = Vector128<T>.Count;
        int* diag1Window = (int*)refDiag1Ptr + rowIndex - lanes;
        int* diag2Window = (int*)refDiag2Ptr + rowIndex - lanes;

        // Widen the 16-bit characters to 32-bit lanes; 0x1b reverses the four target lanes.
        Vector128<int> sourceChars = Sse41.ConvertToVector128Int32((ushort*)sourcePtr + rowIndex - lanes);
        Vector128<int> targetChars = Sse2.Shuffle(
            Sse41.ConvertToVector128Int32((ushort*)targetPtr + columnIndex - 1),
            0x1b);

        Vector128<int> matchMask = Sse2.CompareEqual(sourceChars, targetChars);
        Vector128<int> substitution = Sse2.Add(Sse3.LoadDquVector128(diag1Window), matchMask);
        Vector128<int> deletion = Sse3.LoadDquVector128(diag2Window + 1);
        Vector128<int> insertion = Sse3.LoadDquVector128(diag2Window);

        Vector128<int> cheapest = Sse41.Min(Sse41.Min(insertion, deletion), substitution);

        // All loads are done before this store, which overlaps the diag1 window read above.
        Sse2.Store(diag1Window + 1, Sse2.Add(cheapest, Vector128.Create(1)));
    }
    else if (typeof(T) == typeof(ushort))
    {
        int lanes = Vector128<T>.Count;
        ushort* diag1Window = (ushort*)refDiag1Ptr + rowIndex - lanes;
        ushort* diag2Window = (ushort*)refDiag2Ptr + rowIndex - lanes;

        // Characters are already 16-bit; the byte shuffle with REVERSE_USHORT_AS_BYTE_128
        // reorders the target lanes (presumably a full ushort-lane reversal, per the mask name).
        Vector128<ushort> sourceChars = Sse3.LoadDquVector128((ushort*)sourcePtr + rowIndex - lanes);
        Vector128<ushort> targetChars = Ssse3.Shuffle(
            Sse3.LoadDquVector128((ushort*)targetPtr + columnIndex - 1).AsByte(),
            REVERSE_USHORT_AS_BYTE_128).AsUInt16();

        Vector128<ushort> matchMask = Sse2.CompareEqual(sourceChars, targetChars);
        Vector128<ushort> substitution = Sse2.Add(Sse3.LoadDquVector128(diag1Window), matchMask);
        Vector128<ushort> deletion = Sse3.LoadDquVector128(diag2Window + 1);
        Vector128<ushort> insertion = Sse3.LoadDquVector128(diag2Window);

        Vector128<ushort> cheapest = Sse41.Min(Sse41.Min(insertion, deletion), substitution);

        // All loads are done before this store, which overlaps the diag1 window read above.
        Sse2.Store(diag1Window + 1, Sse2.Add(cheapest, Vector128.Create((ushort)1)));
    }
}
// Scenario: pins _clsVar1/_clsVar2, loads them through the pinned pointers with
// Sse2.LoadVector128, runs Sse3.HorizontalSubtract, then stores and validates the result.
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector128<Double>* pinnedFirst = &_clsVar1)
    fixed (Vector128<Double>* pinnedSecond = &_clsVar2)
    {
        // Loads stay inline in the intrinsic's argument list.
        var horizontalDiff = Sse3.HorizontalSubtract(
            Sse2.LoadVector128((Double*)(pinnedFirst)),
            Sse2.LoadVector128((Double*)(pinnedSecond)));

        Unsafe.Write(_dataTable.outArrayPtr, horizontalDiff);
        ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
    }
}
// Scenario: pins this instance's _fld1/_fld2, loads them through the pinned pointers with
// Sse.LoadVector128, runs Sse3.HorizontalAdd, then stores and validates the result.
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector128<Single>* pinnedFirst = &_fld1)
    fixed (Vector128<Single>* pinnedSecond = &_fld2)
    {
        // Loads stay inline in the intrinsic's argument list.
        var horizontalSum = Sse3.HorizontalAdd(
            Sse.LoadVector128((Single*)(pinnedFirst)),
            Sse.LoadVector128((Single*)(pinnedSecond)));

        Unsafe.Write(_dataTable.outArrayPtr, horizontalSum);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
// Squared 2D distance between `left` and `right`: the sum of the squared X and Y differences,
// broadcast to every element of the returned vector. Picks the best available path in the
// order SSE4.1, SSE3, SSE, and finally DistanceSquared2D_Software.
public static Vector4F DistanceSquared2D(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    // SSE4.1 has a native dot product instruction, dpps
    if (Sse41.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);

        // High nibble 0011: multiply only the first two elements.
        // Low nibble 1111: write the sum to all four result elements.
        const byte control = 0b_0011_1111;
        return Sse41.DotProduct(delta, delta, control);
    }

    // Without dpps the multiplication is still vectorized; only the way the products are
    // summed differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);

        // Mask off Z and W (per the mask's name), leaving (X, Y, 0, 0)
        Vector4F summed = Sse.And(squares, MaskWAndZToZero);

        // Horizontal add with itself gives (X+Y, 0, X+Y, 0)
        summed = Sse3.HorizontalAdd(summed, summed);

        // MoveLowAndDuplicate turns (X, Y, Z, W) into (X, X, Z, Z), so every element is X+Y
        return Sse3.MoveLowAndDuplicate(summed);
    }

    if (Sse.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);

        // Broadcast element 1, add it into element 0 only, then broadcast element 0.
        Vector4F yBroadcast = Sse.Shuffle(squares, squares, Helpers.Shuffle(1, 1, 1, 1));
        squares = Sse.AddScalar(squares, yBroadcast);
        return Sse.Shuffle(squares, squares, Helpers.Shuffle(0, 0, 0, 0));
    }

    return DistanceSquared2D_Software(left, right);
}
// Divides `vector` by the square root of the sum of its squared X and Y elements (broadcast
// to all elements). Picks the best available path in the order SSE4.1, SSE3, SSE, and
// finally Normalize2D_Software.
public static VectorF Normalize2D(VectorFParam1_3 vector)
{
    #region Manual Inline

    // SSE4.1 has a native dot product instruction, dpps
    if (Sse41.IsSupported)
    {
        // High nibble 0011: multiply only the first two elements.
        // Low nibble 1111: write the sum to all four result elements.
        const byte control = 0b_0011_1111;
        VectorF lengthSquared = Sse41.DotProduct(vector, vector, control);

        return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
    }

    // Without dpps the multiplication is still vectorized; only the way the products are
    // summed differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        VectorF squares = Sse.Multiply(vector, vector);

        // Mask off Z and W (per the mask's name), leaving (X, Y, 0, 0)
        VectorF summed = Sse.And(squares, MaskWAndZToZero);

        // Horizontal add with itself gives (X+Y, 0, X+Y, 0)
        summed = Sse3.HorizontalAdd(summed, summed);

        // MoveLowAndDuplicate turns (X, Y, Z, W) into (X, X, Z, Z), so every element is X+Y
        VectorF lengthSquared = Sse3.MoveLowAndDuplicate(summed);

        return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
    }

    if (Sse.IsSupported)
    {
        VectorF squares = Sse.Multiply(vector, vector);

        // Broadcast element 1, add it into element 0 only, then broadcast element 0.
        VectorF yBroadcast = Sse.Shuffle(squares, squares, Shuffle(1, 1, 1, 1));
        squares = Sse.AddScalar(squares, yBroadcast);
        squares = Sse.Shuffle(squares, squares, Shuffle(0, 0, 0, 0));

        return Sse.Divide(vector, Sse.Sqrt(squares));
    }

    #endregion

    return Normalize2D_Software(vector);
}