/// <summary>
/// Checks <c>Sse.AddScalar</c> on <c>float</c> vectors: element 0 of the output must be the sum of
/// the inputs' element 0, and elements 1..3 must be copied from the first input.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
/// <returns><c>Pass</c> when SSE is unavailable or the check succeeds; otherwise <c>Fail</c>.</returns>
static unsafe int Main(string[] args)
{
    // Without SSE there is nothing to exercise, so the test passes trivially.
    if (!Sse.IsSupported)
    {
        return Pass;
    }

    int outcome = Pass;

    using (TestTable<float> table = new TestTable<float>(new float[4] { 1, -5, 100, 0 }, new float[4] { 22, -1, -50, 3 }, new float[4]))
    {
        var lhs = Unsafe.Read<Vector128<float>>(table.inArray1Ptr);
        var rhs = Unsafe.Read<Vector128<float>>(table.inArray2Ptr);
        var sum = Sse.AddScalar(lhs, rhs);
        Unsafe.Write(table.outArrayPtr, sum);

        // x = first input, y = second input, z = output.
        bool matches = table.CheckResult((x, y, z) =>
            (z[0] == (x[0] + y[0])) &&
            (z[1] == x[1]) &&
            (z[2] == x[2]) &&
            (z[3] == x[3]));

        if (!matches)
        {
            Console.WriteLine("SSE AddScalar failed on float:");
            foreach (var element in table.outArray)
            {
                Console.Write(element + ", ");
            }
            Console.WriteLine();
            outcome = Fail;
        }
    }

    return outcome;
}
/// <summary>
/// Computes the squared distance between two 4D vectors, i.e. the sum of the squared
/// element-wise differences of <paramref name="left"/> and <paramref name="right"/>.
/// </summary>
/// <param name="left">The first vector.</param>
/// <param name="right">The second vector.</param>
/// <returns>
/// On the hardware-accelerated paths, a vector with the squared distance in every element.
/// Otherwise whatever <c>DistanceSquared4D_Software</c> returns.
/// </returns>
public static Vector4F DistanceSquared4D(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    if (Sse41.IsSupported)
    {
        Vector4F diff = Sse.Subtract(left, right);

        // dpps control: the high nibble (1111) multiplies all four element pairs,
        // the low nibble (1111) writes the sum into every element of the result.
        const byte control = 0b_1111_1111;
        return Sse41.DotProduct(diff, diff, control);
    }
    else if (Sse3.IsSupported)
    {
        Vector4F diff = Sse.Subtract(left, right);
        Vector4F mul = Sse.Multiply(diff, diff);

        // Two horizontal adds leave the full four-term sum in every element.
        mul = Sse3.HorizontalAdd(mul, mul);
        return Sse3.HorizontalAdd(mul, mul);
    }
    else if (Sse.IsSupported)
    {
        // NOTE(review): the lane comments below assume Helpers.Shuffle packs its arguments
        // like _MM_SHUFFLE (first argument in the highest bits) - confirm against Helpers.
        Vector4F diff = Sse.Subtract(left, right);
        Vector4F copy = diff;
        Vector4F mul = Sse.Multiply(diff, copy);                     // (x2, y2, z2, w2)
        copy = Sse.Shuffle(copy, mul, Helpers.Shuffle(1, 0, 0, 0));  // lanes 2,3 = (x2, y2)
        copy = Sse.Add(copy, mul);                                   // lanes 2,3 = (x2+z2, y2+w2)
        mul = Sse.Shuffle(mul, copy, Helpers.Shuffle(0, 3, 0, 0));   // lane 2 = y2+w2

        // A full-width add is required here: the result is splatted from lane 2, and
        // AddScalar only updates lane 0, which would leave lane 2 as the partial sum y2+w2.
        mul = Sse.Add(mul, copy);                                    // lane 2 = x2+y2+z2+w2
        return Sse.Shuffle(mul, mul, Helpers.Shuffle(2, 2, 2, 2));
    }

    return DistanceSquared4D_Software(left, right);
}
/// <summary>
/// Runs <c>Sse.AddScalar</c> on this instance's <c>_fld1</c> and <c>_fld2</c>, writes the result
/// to the output buffer of <paramref name="testClass"/> and asks it to validate the result.
/// </summary>
/// <param name="testClass">The test object that owns the output buffer and performs validation.</param>
public void RunStructFldScenario(SimpleBinaryOpTest__AddScalarSingle testClass)
{
    var sum = Sse.AddScalar(_fld1, _fld2);

    Unsafe.Write(testClass._dataTable.outArrayPtr, sum);
    testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
}
/// <summary>
/// Normalizes a 4D vector by dividing every element by the vector's length
/// (the square root of its dot product with itself).
/// </summary>
/// <param name="vector">The vector to normalize.</param>
/// <returns>
/// On the hardware-accelerated paths, <paramref name="vector"/> divided element-wise by its length.
/// Otherwise whatever <c>Normalize4D_Software</c> returns.
/// </returns>
public static VectorF Normalize4D(VectorFParam1_3 vector)
{
    if (Sse41.IsSupported)
    {
        // dpps control: the high nibble (1111) multiplies all four element pairs,
        // the low nibble (1111) writes the sum into every element of the result.
        const byte control = 0b_1111_1111;

        // The dot product is the squared length, so take the square root before dividing,
        // exactly as the SSE3 and SSE paths below do.
        return Sse.Divide(vector, Sse.Sqrt(Sse41.DotProduct(vector, vector, control)));
    }
    else if (Sse3.IsSupported)
    {
        VectorF mul = Sse.Multiply(vector, vector);

        // Two horizontal adds leave the full four-term sum in every element.
        mul = Sse3.HorizontalAdd(mul, mul);
        return Sse.Divide(vector, Sse.Sqrt(Sse3.HorizontalAdd(mul, mul)));
    }
    else if (Sse.IsSupported)
    {
        // NOTE(review): the lane comments below assume Shuffle packs its arguments
        // like _MM_SHUFFLE (first argument in the highest bits) - confirm against its definition.
        VectorF copy = vector;
        VectorF mul = Sse.Multiply(vector, copy);             // (x2, y2, z2, w2)
        copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));   // lanes 2,3 = (x2, y2)
        copy = Sse.Add(copy, mul);                            // lanes 2,3 = (x2+z2, y2+w2)
        mul = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));    // lane 2 = y2+w2

        // A full-width add is required here: the result is splatted from lane 2, and
        // AddScalar only updates lane 0, which would leave lane 2 as the partial sum y2+w2.
        mul = Sse.Add(mul, copy);                             // lane 2 = x2+y2+z2+w2
        return Sse.Divide(vector, Sse.Sqrt(Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2))));
    }

    return Normalize4D_Software(vector);
}
/// <summary>
/// Computes the 4D dot product of <paramref name="left"/> and <paramref name="right"/>.
/// </summary>
/// <param name="left">The first vector.</param>
/// <param name="right">The second vector.</param>
/// <returns>
/// On the hardware-accelerated paths, a vector with the dot product in every element.
/// Otherwise whatever <c>DotProduct4D_Software</c> returns.
/// </returns>
public static VectorF DotProduct4D(VectorFParam1_3 left, VectorFParam1_3 right)
{
    if (Sse41.IsSupported)
    {
        // dpps control: the high nibble (1111) multiplies all four element pairs,
        // the low nibble (1111) writes the sum into every element of the result.
        const byte control = 0b_1111_1111;
        return Sse41.DotProduct(left, right, control);
    }
    else if (Sse3.IsSupported)
    {
        VectorF mul = Sse.Multiply(left, right);

        // Two horizontal adds leave the full four-term sum in every element.
        mul = Sse3.HorizontalAdd(mul, mul);
        return Sse3.HorizontalAdd(mul, mul);
    }
    else if (Sse.IsSupported)
    {
        // NOTE(review): the lane comments below assume Shuffle packs its arguments
        // like _MM_SHUFFLE (first argument in the highest bits) - confirm against its definition.
        VectorF copy = right;
        VectorF mul = Sse.Multiply(left, copy);               // (xx, yy, zz, ww) products
        copy = Sse.Shuffle(copy, mul, Shuffle(1, 0, 0, 0));   // lanes 2,3 = (xx, yy)
        copy = Sse.Add(copy, mul);                            // lanes 2,3 = (xx+zz, yy+ww)
        mul = Sse.Shuffle(mul, copy, Shuffle(0, 3, 0, 0));    // lane 2 = yy+ww

        // A full-width add is required here: the result is splatted from lane 2, and
        // AddScalar only updates lane 0, which would leave lane 2 as the partial sum yy+ww.
        mul = Sse.Add(mul, copy);                             // lane 2 = xx+yy+zz+ww
        return Sse.Shuffle(mul, mul, Shuffle(2, 2, 2, 2));
    }

    return DotProduct4D_Software(left, right);
}
/// <summary>
/// Runs <c>Sse.AddScalar</c> on the fields <c>_fld1</c> and <c>_fld2</c>, stores the result in
/// the data table's output buffer and validates it against the output array.
/// </summary>
public void RunFldScenario()
{
    var sum = Sse.AddScalar(_fld1, _fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_fld1, _fld2, _dataTable.outArray);
}
/// <summary>
/// Runs <c>Sse.AddScalar</c> on the fields of a local <c>TestStruct</c> obtained from
/// <c>TestStruct.Create()</c>, stores the result in the output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    var localStruct = TestStruct.Create();
    var sum = Sse.AddScalar(localStruct._fld1, localStruct._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Runs <c>Sse.AddScalar</c> on the fields of a freshly constructed
/// <c>SimpleBinaryOpTest__AddScalarSingle</c>, stores the result in this instance's output
/// buffer and validates it against the output array.
/// </summary>
public void RunLclFldScenario()
{
    var localTest = new SimpleBinaryOpTest__AddScalarSingle();
    var sum = Sse.AddScalar(localTest._fld1, localTest._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(localTest._fld1, localTest._fld2, _dataTable.outArray);
}
/// <summary>
/// Loads both operands into locals with <c>Sse.LoadAlignedVector128</c>, runs
/// <c>Sse.AddScalar</c> on them, stores the result in the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    var firstOp = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr));
    var secondOp = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr));
    var sum = Sse.AddScalar(firstOp, secondOp);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads both operands into locals with <c>Unsafe.Read</c>, runs <c>Sse.AddScalar</c> on them,
/// stores the result in the output buffer and validates it against the output array.
/// </summary>
public void RunLclVarScenario()
{
    Vector128<Single> firstOp = Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr);
    Vector128<Single> secondOp = Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr);
    Vector128<Single> sum = Sse.AddScalar(firstOp, secondOp);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOp, secondOp, _dataTable.outArray);
}
/// <summary>
/// Announces the scenario, runs <c>Sse.AddScalar</c> on the fields <c>_fld1</c> and
/// <c>_fld2</c>, stores the result in the output buffer and validates it.
/// </summary>
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    var sum = Sse.AddScalar(_fld1, _fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, then runs <c>Sse.AddScalar</c> on the fields of a local
/// <c>TestStruct</c> obtained from <c>TestStruct.Create()</c>, stores the result in the
/// output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    var localStruct = TestStruct.Create();
    var sum = Sse.AddScalar(localStruct._fld1, localStruct._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Runs <c>Sse.AddScalar</c> directly on two <c>Sse.LoadAlignedVector128</c> loads from the
/// input buffers, stores the result in the output buffer and validates it against the
/// input and output pointers.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // The loads are passed straight to AddScalar without intermediate locals.
    var sum = Sse.AddScalar(
        Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr)),
        Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr)));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, then runs <c>Sse.AddScalar</c> on the fields of a freshly
/// constructed <c>SimpleBinaryOpTest__AddScalarSingle</c>, stores the result in this
/// instance's output buffer and validates it.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    var localTest = new SimpleBinaryOpTest__AddScalarSingle();
    var sum = Sse.AddScalar(localTest._fld1, localTest._fld2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(localTest._fld1, localTest._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Runs <c>Sse.AddScalar</c> on <c>_clsVar1</c> and <c>_clsVar2</c>, stores the result in the
/// output buffer and validates it against the output array.
/// </summary>
public void RunClsVarScenario()
{
    var sum = Sse.AddScalar(_clsVar1, _clsVar2);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_clsVar1, _clsVar2, _dataTable.outArray);
}
/// <summary>
/// Runs <c>Sse.AddScalar</c> directly on two <c>Unsafe.Read</c> loads from the input buffers,
/// stores the result in the output buffer and validates it against the input and output arrays.
/// </summary>
public void RunBasicScenario()
{
    // The reads are passed straight to AddScalar without intermediate locals.
    var sum = Sse.AddScalar(
        Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(_dataTable.inArray1, _dataTable.inArray2, _dataTable.outArray);
}
/// <summary>
/// Announces the scenario, loads both operands into locals with
/// <c>Sse.LoadAlignedVector128</c>, runs <c>Sse.AddScalar</c> on them, stores the result in
/// the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var firstOp = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray1Ptr));
    var secondOp = Sse.LoadAlignedVector128((Single*)(_dataTable.inArray2Ptr));
    var sum = Sse.AddScalar(firstOp, secondOp);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, reads both operands into locals with <c>Unsafe.Read</c>, runs
/// <c>Sse.AddScalar</c> on them, stores the result in the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    Vector128<Single> firstOp = Unsafe.Read<Vector128<Single>>(_dataTable.inArray1Ptr);
    Vector128<Single> secondOp = Unsafe.Read<Vector128<Single>>(_dataTable.inArray2Ptr);
    Vector128<Single> sum = Sse.AddScalar(firstOp, secondOp);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, loads both operands into locals with <c>Sse.LoadVector128</c>,
/// runs <c>Sse.AddScalar</c> on them, stores the result in the output buffer and validates it.
/// </summary>
public void RunLclVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_Load));

    var firstOp = Sse.LoadVector128((Single*)(_dataTable.inArray1Ptr));
    var secondOp = Sse.LoadVector128((Single*)(_dataTable.inArray2Ptr));
    var sum = Sse.AddScalar(firstOp, secondOp);

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(firstOp, secondOp, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, then runs <c>Sse.AddScalar</c> on vectors loaded with
/// <c>Sse.LoadVector128</c> from the addresses of a local <c>TestStruct</c>'s fields, stores
/// the result in the output buffer and validates it.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var localStruct = TestStruct.Create();

    // Operands are loaded through pointers to the local struct's fields.
    var sum = Sse.AddScalar(
        Sse.LoadVector128((Single*)(&localStruct._fld1)),
        Sse.LoadVector128((Single*)(&localStruct._fld2)));

    Unsafe.Write(_dataTable.outArrayPtr, sum);
    ValidateResult(localStruct._fld1, localStruct._fld2, _dataTable.outArrayPtr);
}
/// <summary>
/// Pins this instance's <c>_fld1</c> and <c>_fld2</c>, runs <c>Sse.AddScalar</c> on vectors
/// loaded from them with <c>Sse.LoadVector128</c>, writes the result to the output buffer of
/// <paramref name="testClass"/> and asks it to validate the result.
/// </summary>
/// <param name="testClass">The test object that owns the output buffer and performs validation.</param>
public void RunStructFldScenario_Load(SimpleBinaryOpTest__AddScalarSingle testClass)
{
    // Both fields stay pinned for the whole load/compute/validate sequence.
    fixed (Vector128<Single>* pFirst = &_fld1, pSecond = &_fld2)
    {
        var sum = Sse.AddScalar(
            Sse.LoadVector128((Single*)(pFirst)),
            Sse.LoadVector128((Single*)(pSecond)));

        Unsafe.Write(testClass._dataTable.outArrayPtr, sum);
        testClass.ValidateResult(_fld1, _fld2, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Announces the scenario, pins the fields <c>_fld1</c> and <c>_fld2</c>, runs
/// <c>Sse.AddScalar</c> on vectors loaded from them with <c>Sse.LoadVector128</c>, stores the
/// result in the output buffer and validates it.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    // Both fields stay pinned for the whole load/compute/validate sequence.
    fixed (Vector128<Single>* pFirst = &_fld1, pSecond = &_fld2)
    {
        var sum = Sse.AddScalar(
            Sse.LoadVector128((Single*)(pFirst)),
            Sse.LoadVector128((Single*)(pSecond)));

        Unsafe.Write(_dataTable.outArrayPtr, sum);
        ValidateResult(_fld1, _fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Computes the squared 2D distance between <paramref name="left"/> and
/// <paramref name="right"/>, using only the first two elements of their difference.
/// </summary>
/// <param name="left">The first vector.</param>
/// <param name="right">The second vector.</param>
/// <returns>
/// On the hardware-accelerated paths, a vector with the squared distance in every element.
/// Otherwise whatever <c>DistanceSquared2D_Software</c> returns.
/// </returns>
public static Vector4F DistanceSquared2D(Vector4FParam1_3 left, Vector4FParam1_3 right)
{
    // Preferred path: SSE4.1 provides a dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);

        // High nibble 0011: multiply only elements 0 and 1.
        // Low nibble 1111: write the sum into every element of the result.
        const byte xyToAll = 0b_0011_1111;
        return Sse41.DotProduct(delta, delta, xyToAll);
    }

    // Otherwise multiply with SSE and sum the lanes; the cheapest way to sum
    // differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);

        // Mask off Z and W so that only X and Y contribute.
        Vector4F masked = Sse.And(squares, MaskWAndZToZero);

        // The horizontal add puts X+Y into elements 0 and 2.
        Vector4F summed = Sse3.HorizontalAdd(masked, masked);

        // MoveLowAndDuplicate copies elements 0 and 2 over elements 1 and 3.
        return Sse3.MoveLowAndDuplicate(summed);
    }

    if (Sse.IsSupported)
    {
        Vector4F delta = Sse.Subtract(left, right);
        Vector4F squares = Sse.Multiply(delta, delta);

        // Broadcast Y, add it to X in element 0, then broadcast element 0.
        Vector4F ySplat = Sse.Shuffle(squares, squares, Helpers.Shuffle(1, 1, 1, 1));
        squares = Sse.AddScalar(squares, ySplat);
        return Sse.Shuffle(squares, squares, Helpers.Shuffle(0, 0, 0, 0));
    }

    return DistanceSquared2D_Software(left, right);
}
/// <summary>
/// Divides every element of <paramref name="vector"/> by its 2D length, i.e. the square
/// root of the dot product of its first two elements with themselves.
/// </summary>
/// <param name="vector">The vector to normalize.</param>
/// <returns>
/// On the hardware-accelerated paths, <paramref name="vector"/> divided element-wise by its 2D length.
/// Otherwise whatever <c>Normalize2D_Software</c> returns.
/// </returns>
public static VectorF Normalize2D(VectorFParam1_3 vector)
{
    #region Manual Inline

    // Preferred path: SSE4.1 provides a dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        // High nibble 0011: multiply only elements 0 and 1.
        // Low nibble 1111: write the sum into every element of the result.
        const byte xyToAll = 0b_0011_1111;
        VectorF lengthSquared = Sse41.DotProduct(vector, vector, xyToAll);
        return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
    }

    // Otherwise multiply with SSE and sum the lanes; the cheapest way to sum
    // differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        VectorF squares = Sse.Multiply(vector, vector);

        // Mask off Z and W so that only X and Y contribute.
        VectorF masked = Sse.And(squares, MaskWAndZToZero);

        // The horizontal add puts X+Y into elements 0 and 2.
        VectorF summed = Sse3.HorizontalAdd(masked, masked);

        // MoveLowAndDuplicate copies elements 0 and 2 over elements 1 and 3.
        VectorF lengthSquared = Sse3.MoveLowAndDuplicate(summed);
        return Sse.Divide(vector, Sse.Sqrt(lengthSquared));
    }

    if (Sse.IsSupported)
    {
        VectorF squares = Sse.Multiply(vector, vector);

        // Broadcast Y, add it to X in element 0, then broadcast element 0.
        VectorF ySplat = Sse.Shuffle(squares, squares, Shuffle(1, 1, 1, 1));
        squares = Sse.AddScalar(squares, ySplat);
        squares = Sse.Shuffle(squares, squares, Shuffle(0, 0, 0, 0));
        return Sse.Divide(vector, Sse.Sqrt(squares));
    }

    #endregion

    return Normalize2D_Software(vector);
}
/// <summary>
/// Computes the 3D dot product of <paramref name="left"/> and <paramref name="right"/>,
/// using only their first three elements.
/// </summary>
/// <param name="left">The first vector.</param>
/// <param name="right">The second vector.</param>
/// <returns>
/// On the hardware-accelerated paths, a vector with the dot product in every element.
/// Otherwise whatever <c>DotProduct3D_Software</c> returns.
/// </returns>
public static VectorF DotProduct3D(VectorFParam1_3 left, VectorFParam1_3 right)
{
    // Preferred path: SSE4.1 provides a dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        // High nibble 0111: multiply only elements 0, 1 and 2.
        // Low nibble 1111: write the sum into every element of the result.
        const byte xyzToAll = 0b_0111_1111;
        return Sse41.DotProduct(left, right, xyzToAll);
    }

    // Otherwise multiply with SSE and sum the lanes; the cheapest way to sum
    // differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        VectorF products = Multiply(left, right);

        // Mask off W so that it does not contribute to the sum.
        VectorF masked = And(products, MaskWToZero);

        // Two horizontal adds leave X+Y+Z in every element.
        VectorF pairSums = HorizontalAdd(masked, masked);
        return HorizontalAdd(pairSums, pairSums);
    }

    if (Sse.IsSupported)
    {
        VectorF products = Multiply(left, right);

        // Accumulate in element 0 with two scalar adds, then broadcast element 0.
        VectorF rotated = Sse.Shuffle(products, products, Shuffle(2, 1, 2, 1));
        products = Sse.AddScalar(products, rotated);
        rotated = Sse.Shuffle(rotated, rotated, Shuffle(1, 1, 1, 1));
        products = Sse.AddScalar(products, rotated);
        return Sse.Shuffle(products, products, Shuffle(0, 0, 0, 0));
    }

    return DotProduct3D_Software(left, right);
}
/// <summary>
/// Computes the 2D dot product of <paramref name="left"/> and <paramref name="right"/>.
/// </summary>
/// <param name="left">The first vector.</param>
/// <param name="right">The second vector.</param>
/// <returns>
/// On the hardware-accelerated paths, a vector with the dot product of elements 0 and 1 in
/// every element. Otherwise whatever <c>DotProduct2D_Software</c> returns.
/// </returns>
public static Vector128<float> DotProduct2D(Vector128<float> left, Vector128<float> right)
{
    // Preferred path: SSE4.1 provides a dot product instruction (dpps).
    if (Sse41.IsSupported)
    {
        // High nibble 0011: multiply only elements 0 and 1.
        // Low nibble 1111: write the sum into every element of the result.
        const byte xyToAll = 0b_0011_1111;
        return Sse41.DotProduct(left, right, xyToAll);
    }

    // Otherwise multiply with SSE and sum the lanes; the cheapest way to sum
    // differs between SSE3 and plain SSE.
    if (Sse3.IsSupported)
    {
        Vector128<float> products = Sse.Multiply(left, right);

        // Apply SingleConstants.MaskW (per the original comment this sets W to zero).
        Vector128<float> masked = Sse.And(products, SingleConstants.MaskW);

        // The horizontal add puts X+Y into elements 0 and 2 (and Z+W into 1 and 3).
        Vector128<float> summed = Sse3.HorizontalAdd(masked, masked);

        // MoveLowAndDuplicate copies elements 0 and 2 over elements 1 and 3.
        return Sse3.MoveLowAndDuplicate(summed);
    }

    if (Sse.IsSupported)
    {
        Vector128<float> products = Sse.Multiply(left, right);

        // Shuffle with YYYY, add the result into element 0, then shuffle with XXXX.
        Vector128<float> ySplat = Sse.Shuffle(products, products, ShuffleValues.YYYY);
        products = Sse.AddScalar(products, ySplat);
        return Sse.Shuffle(products, products, ShuffleValues.XXXX);
    }

    return DotProduct2D_Software(left, right);
}
// Convolves one line of interleaved float samples with per-output weight runs and writes
// three floats per output position.
//
// Parameters, as used by the code below:
//   istart    - start of the input line; read as float*.
//   tstart/cb - start and byte length of the output region; the outer loop runs while
//               tp < tstart + cb, and tp advances by smapy * 4 floats per output position.
//   mapxstart - map read as float*: for each output position, one int (the input start
//               index ix) followed by smapx * channels weights.
//   smapx     - weights per channel per output position, consumed in groups of
//               Vector128<float>.Count.
//               NOTE(review): a remainder of smapx / 4 is not processed - presumably the map
//               is padded to a multiple of 4; confirm against the map builder.
//   smapy     - only used to compute the output stride (smapy * 4 floats).
//
// NOTE(review): `channels` is declared outside this block. The three-vector loads, the lane
// rotation and the three tp[] stores below assume 3 interleaved channels - confirm.
//
// Per output position:
//   1. AVX path (Avx supported and at least 2 groups left): each iteration loads 24 input
//      floats and 24 weights as three Vector256 pairs and accumulates into ax0..ax2, using
//      FMA when available. The 256-bit accumulators are then folded into av0..av2 by pairing
//      halves that sit 12 floats apart (ax0.lower + ax1.upper, ax0.upper + ax2.lower,
//      ax1.lower + ax2.upper), which matches the per-group layout the SSE loop uses.
//      Otherwise av0..av2 start at zero.
//   2. SSE loop: handles the remaining groups, 12 floats per iteration, into av0..av2.
//   3. Reduction: with 3 interleaved channels, av0/av1/av2 hold lanes in channel order
//      (0,1,2,0), (1,2,0,1) and (2,0,1,2). The three shuffles rotate them so that lanes
//      0..2 of avs0 collect the partial sums for channels 0..2 that are NOT already in
//      lane 0 of av0/av1/av2; lane 3 of avs0 is not used. avs1 moves lane 1 of avs0 to
//      lane 0 (MoveHighAndDuplicate, or the equivalent 0b_11_11_01_01 shuffle without
//      SSE3), and avs2 moves lane 2 to lane 0 (UnpackHigh). AddScalar then adds lane 0 of
//      av0/av1/av2 to finish each channel sum before it is stored to tp[0..2].
unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy) { float *tp = (float *)tstart, tpe = (float *)(tstart + cb); float *pmapx = (float *)mapxstart; int kstride = smapx * channels; int tstride = smapy * 4; int vcnt = smapx / Vector128 <float> .Count; while (tp < tpe) { int ix = *(int *)pmapx++; int lcnt = vcnt; float *ip = (float *)istart + ix * channels; float *mp = pmapx; pmapx += kstride; Vector128 <float> av0, av1, av2; if (Avx.IsSupported && lcnt >= 2) { Vector256 <float> ax0 = Vector256 <float> .Zero, ax1 = ax0, ax2 = ax0; for (; lcnt >= 2; lcnt -= 2) { var iv0 = Avx.LoadVector256(ip); var iv1 = Avx.LoadVector256(ip + Vector256 <float> .Count); var iv2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2); ip += Vector256 <int> .Count * channels; if (Fma.IsSupported) { ax0 = Fma.MultiplyAdd(Avx.LoadVector256(mp), iv0, ax0); ax1 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count), iv1, ax1); ax2 = Fma.MultiplyAdd(Avx.LoadVector256(mp + Vector256 <float> .Count * 2), iv2, ax2); } else { ax0 = Avx.Add(ax0, Avx.Multiply(iv0, Avx.LoadVector256(mp))); ax1 = Avx.Add(ax1, Avx.Multiply(iv1, Avx.LoadVector256(mp + Vector256 <float> .Count))); ax2 = Avx.Add(ax2, Avx.Multiply(iv2, Avx.LoadVector256(mp + Vector256 <float> .Count * 2))); } mp += Vector256 <float> .Count * channels; } av0 = Sse.Add(ax0.GetLower(), ax1.GetUpper()); av1 = Sse.Add(ax0.GetUpper(), ax2.GetLower()); av2 = Sse.Add(ax1.GetLower(), ax2.GetUpper()); } else { av0 = av1 = av2 = Vector128 <float> .Zero; } for (; lcnt != 0; lcnt--) { var iv0 = Sse.LoadVector128(ip); var iv1 = Sse.LoadVector128(ip + Vector128 <float> .Count); var iv2 = Sse.LoadVector128(ip + Vector128 <float> .Count * 2); ip += Vector128 <float> .Count * channels; if (Fma.IsSupported) { av0 = Fma.MultiplyAdd(Sse.LoadVector128(mp), iv0, av0); av1 = Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count), iv1, av1); av2 = 
Fma.MultiplyAdd(Sse.LoadVector128(mp + Vector128 <float> .Count * 2), iv2, av2); } else { av0 = Sse.Add(av0, Sse.Multiply(iv0, Sse.LoadVector128(mp))); av1 = Sse.Add(av1, Sse.Multiply(iv1, Sse.LoadVector128(mp + Vector128 <float> .Count))); av2 = Sse.Add(av2, Sse.Multiply(iv2, Sse.LoadVector128(mp + Vector128 <float> .Count * 2))); } mp += Vector128 <float> .Count * channels; } var avs0 = Sse.Add(Sse.Add( Sse.Shuffle(av0, av0, 0b_00_10_01_11), Sse.Shuffle(av1, av1, 0b_00_01_11_10)), Sse.Shuffle(av2, av2, 0b_00_11_10_01) ); var avs1 = Sse3.IsSupported ? Sse3.MoveHighAndDuplicate(avs0) : Sse.Shuffle(avs0, avs0, 0b_11_11_01_01); var avs2 = Sse.UnpackHigh(avs0, avs0); tp[0] = Sse.AddScalar(av0, avs0).ToScalar(); tp[1] = Sse.AddScalar(av1, avs1).ToScalar(); tp[2] = Sse.AddScalar(av2, avs2).ToScalar(); tp += tstride; } }
/// <summary>
/// Wrapper named after the C intrinsic <c>_mm_add_ss</c>; forwards to <c>Sse.AddScalar</c>.
/// </summary>
/// <param name="a">The first operand.</param>
/// <param name="b">The second operand.</param>
/// <returns>The result of <c>Sse.AddScalar(a, b)</c>.</returns>
public static __m128 _mm_add_ss(__m128 a, __m128 b)
{
    return Sse.AddScalar(a, b);
}
/// <summary>
/// Wrapper named after the C intrinsic <c>_mm_add_ss</c>; forwards to <c>Sse.AddScalar</c>,
/// which adds the lowest elements and copies the upper three elements from
/// <paramref name="left"/>.
/// </summary>
/// <param name="left">The first operand; also supplies the upper three elements of the result.</param>
/// <param name="right">The second operand; only its lowest element is used.</param>
/// <returns>The result of <c>Sse.AddScalar(left, right)</c>.</returns>
public static Vector128<float> _mm_add_ss(Vector128<float> left, Vector128<float> right)
    => Sse.AddScalar(left, right);