/// <summary>
/// Blends this instance's three vector fields with <c>Avx.BlendVariable</c>, writes the
/// result into the output buffer owned by <paramref name="testClass"/>, and asks that
/// test class to validate the output against the same fields.
/// </summary>
/// <param name="testClass">Test class that supplies the output buffer and the validator.</param>
public void RunStructFldScenario(SimpleTernaryOpTest__BlendVariableDouble testClass)
{
    // The fields are passed straight to the intrinsic (no locals) on purpose:
    // the scenario exercises the field-operand form.
    var blended = Avx.BlendVariable(_fld1, _fld2, _fld3);

    Unsafe.Write(testClass._dataTable.outArrayPtr, blended);
    testClass.ValidateResult(_fld1, _fld2, _fld3, testClass._dataTable.outArrayPtr);
}
/// <summary>
/// Computes, for eight lanes at once, the score <c>U + Q</c>.
/// </summary>
/// <remarks>
/// <para>
/// <c>U = (P * cpuctSqrtParentN) / (1 + d)</c> where <c>d</c> is <c>N + NInFlight</c> raised to
/// <paramref name="uctDenominatorPower"/> (powers 1 and 0.5 take dedicated fast paths).
/// </para>
/// <para>
/// <c>Q = (lossContrib - W) / (N + NInFlight)</c> for lanes where <c>N + NInFlight &gt; 0</c>, and
/// <c>vQWhenNoChildren + lossContrib</c> for the remaining lanes, with
/// <c>lossContrib = NInFlight * vVirtualLossMultiplier</c>.
/// </para>
/// </remarks>
/// <param name="vW">Per-lane W values.</param>
/// <param name="vN">Per-lane N values.</param>
/// <param name="vP">Per-lane P values.</param>
/// <param name="vVirtualLossMultiplier">Per-lane multiplier applied to the in-flight count.</param>
/// <param name="cpuctSqrtParentN">Scalar factor broadcast to all lanes and multiplied with P.</param>
/// <param name="uctDenominatorPower">Exponent applied to <c>N + NInFlight</c> in the U denominator.</param>
/// <param name="vQWhenNoChildren">Per-lane Q used where <c>N + NInFlight</c> is not greater than zero.</param>
/// <param name="vNInFlight">Per-lane in-flight counts.</param>
/// <returns>The per-lane sum <c>U + Q</c>.</returns>
private static Vector256<float> ComputeScores(Vector256<float> vW, Vector256<float> vN, Vector256<float> vP,
                                              Vector256<float> vVirtualLossMultiplier,
                                              float cpuctSqrtParentN, float uctDenominatorPower,
                                              Vector256<float> vQWhenNoChildren, Vector256<float> vNInFlight)
{
    Vector256<float> vVisits = Avx.Add(vN, vNInFlight);

    // Raise the visit count to the requested power, avoiding the general
    // power routine for the two common exponents.
    Vector256<float> vPoweredVisits;
    if (uctDenominatorPower == 1.0f)
    {
        vPoweredVisits = vVisits;
    }
    else if (uctDenominatorPower == 0.5f)
    {
        vPoweredVisits = Avx.Sqrt(vVisits);
    }
    else
    {
        vPoweredVisits = ToPower(vVisits, uctDenominatorPower);
    }

    Vector256<float> vVirtualLoss = Avx.Multiply(vNInFlight, vVirtualLossMultiplier);

    // U = (p * cpuct * sqrt(parentN)) / (1 + (n + n_in_flight)^power)
    Vector256<float> vExploreNumerator = Avx.Multiply(vP, Vector256.Create(cpuctSqrtParentN));
    Vector256<float> vExploreDenominator = Avx.Add(vOnes, vPoweredVisits);
    Vector256<float> vExplore = Avx.Divide(vExploreNumerator, vExploreDenominator);

    // Q for visited lanes versus lanes that have no visits yet.
    Vector256<float> vQVisited = Avx.Divide(Avx.Subtract(vVirtualLoss, vW), vVisits);
    Vector256<float> vQUnvisited = Avx.Add(vQWhenNoChildren, vVirtualLoss);

    // The mask is set where (n + n_in_flight) > 0; BlendVariable picks its second
    // operand for set lanes, so visited lanes receive vQVisited.
    Vector256<float> vHasVisitsMask = Avx.Compare(vVisits, vZeros, FloatComparisonMode.OrderedGreaterThanSignaling);
    Vector256<float> vExploit = Avx.BlendVariable(vQUnvisited, vQVisited, vHasVisitsMask);

    return Avx.Add(vExplore, vExploit);
}
/// <summary>
/// Blends this instance's three vector fields with <c>Avx.BlendVariable</c>, stores the
/// result in the data table's output buffer, and validates it against the same fields.
/// </summary>
public void RunFldScenario()
{
    // Fields are deliberately passed directly as operands (no intermediate locals).
    var blended = Avx.BlendVariable(_fld1, _fld2, _fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Creates a fresh <c>SimpleTernaryOpTest__BlendVariableSingle</c>, blends that local
/// instance's three vector fields with <c>Avx.BlendVariable</c>, stores the result in this
/// instance's output buffer, and validates it.
/// </summary>
public void RunLclFldScenario()
{
    var instance = new SimpleTernaryOpTest__BlendVariableSingle();

    // Operands are fields of a local object, passed directly to the intrinsic.
    var blended = Avx.BlendVariable(instance._fld1, instance._fld2, instance._fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(instance._fld1, instance._fld2, instance._fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Obtains a <c>TestStruct</c> from <c>TestStruct.Create()</c>, blends that local struct's
/// three vector fields with <c>Avx.BlendVariable</c>, stores the result in the output
/// buffer, and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    var localStruct = TestStruct.Create();

    // Operands are fields of a local struct, passed directly to the intrinsic.
    var blended = Avx.BlendVariable(localStruct._fld1, localStruct._fld2, localStruct._fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(localStruct._fld1, localStruct._fld2, localStruct._fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario to the test framework, blends this instance's three vector
/// fields with <c>Avx.BlendVariable</c>, stores the result in the output buffer, and
/// validates it against the same fields.
/// </summary>
public void RunClassFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario));

    // Fields are deliberately passed directly as operands (no intermediate locals).
    var blended = Avx.BlendVariable(_fld1, _fld2, _fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, creates a fresh <c>SimpleTernaryOpTest__BlendVariableDouble</c>,
/// blends that local instance's three vector fields with <c>Avx.BlendVariable</c>, stores
/// the result in this instance's output buffer, and validates it.
/// </summary>
public void RunClassLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario));

    var instance = new SimpleTernaryOpTest__BlendVariableDouble();

    // Operands are fields of a local object, passed directly to the intrinsic.
    var blended = Avx.BlendVariable(instance._fld1, instance._fld2, instance._fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(instance._fld1, instance._fld2, instance._fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads the three operands from the data table's input buffers with <c>Unsafe.Read</c>
/// into locals, blends them with <c>Avx.BlendVariable</c>, stores the result in the output
/// buffer, and validates it against those locals.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    var left = Unsafe.Read<Vector256<Single>>(_dataTable.inArray1Ptr);
    var right = Unsafe.Read<Vector256<Single>>(_dataTable.inArray2Ptr);
    var mask = Unsafe.Read<Vector256<Single>>(_dataTable.inArray3Ptr);

    var blended = Avx.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads the three operands from the data table's input buffers with
/// <c>Avx.LoadAlignedVector256</c> into locals, blends them with <c>Avx.BlendVariable</c>,
/// stores the result in the output buffer, and validates it against those locals.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    var left = Avx.LoadAlignedVector256((Single*)(_dataTable.inArray1Ptr));
    var right = Avx.LoadAlignedVector256((Single*)(_dataTable.inArray2Ptr));
    var mask = Avx.LoadAlignedVector256((Single*)(_dataTable.inArray3Ptr));

    var blended = Avx.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, obtains a <c>TestStruct</c> from <c>TestStruct.Create()</c>,
/// blends that local struct's three vector fields with <c>Avx.BlendVariable</c>, stores the
/// result in the output buffer, and validates it.
/// </summary>
public void RunStructLclFldScenario()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario));

    var localStruct = TestStruct.Create();

    // Operands are fields of a local struct, passed directly to the intrinsic.
    var blended = Avx.BlendVariable(localStruct._fld1, localStruct._fld2, localStruct._fld3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(localStruct._fld1, localStruct._fld2, localStruct._fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Blends three operands read inline from the data table's input buffers with
/// <c>Unsafe.Read</c>, stores the result in the output buffer, and validates it against
/// the raw input buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The reads stay nested inside the call: the scenario exercises the form
    // where operands are produced directly as arguments.
    var blended = Avx.BlendVariable(
        Unsafe.Read<Vector256<Single>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector256<Single>>(_dataTable.inArray2Ptr),
        Unsafe.Read<Vector256<Single>>(_dataTable.inArray3Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Blends three operands loaded inline from the data table's input buffers with
/// <c>Avx.LoadAlignedVector256</c>, stores the result in the output buffer, and validates
/// it against the raw input buffers.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // The loads stay nested inside the call: the scenario exercises the form
    // where operands are produced directly as arguments.
    var blended = Avx.BlendVariable(
        Avx.LoadAlignedVector256((Single*)(_dataTable.inArray1Ptr)),
        Avx.LoadAlignedVector256((Single*)(_dataTable.inArray2Ptr)),
        Avx.LoadAlignedVector256((Single*)(_dataTable.inArray3Ptr)));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Blends the three <c>_clsVar</c> vectors with <c>Avx.BlendVariable</c>, stores the
/// result in the output buffer, and validates it against the same variables.
/// </summary>
public void RunClsVarScenario()
{
    // The class variables are passed directly as operands (no intermediate locals).
    var blended = Avx.BlendVariable(_clsVar1, _clsVar2, _clsVar3);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_clsVar1, _clsVar2, _clsVar3, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, loads the three operands from the data table's input buffers
/// with <c>Avx.LoadAlignedVector256</c> into locals, blends them with
/// <c>Avx.BlendVariable</c>, stores the result in the output buffer, and validates it
/// against those locals.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var left = Avx.LoadAlignedVector256((Double*)(_dataTable.inArray1Ptr));
    var right = Avx.LoadAlignedVector256((Double*)(_dataTable.inArray2Ptr));
    var mask = Avx.LoadAlignedVector256((Double*)(_dataTable.inArray3Ptr));

    var blended = Avx.BlendVariable(left, right, mask);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, mask, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, loads the three operands from the data table's input buffers
/// with <c>Avx.LoadAlignedVector256</c> into locals, blends them with
/// <c>Avx.BlendVariable</c>, stores the result in the output buffer, and validates it
/// against those locals.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_LoadAligned));

    var left = Avx.LoadAlignedVector256((Double*)(_dataTable.inArray1Ptr));
    var right = Avx.LoadAlignedVector256((Double*)(_dataTable.inArray2Ptr));
    var selector = Avx.LoadAlignedVector256((Double*)(_dataTable.inArray3Ptr));

    var blended = Avx.BlendVariable(left, right, selector);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, selector, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, reads the three operands from the data table's input buffers
/// with <c>Unsafe.Read</c> into locals, blends them with <c>Avx.BlendVariable</c>, stores
/// the result in the output buffer, and validates it against those locals.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunLclVarScenario_UnsafeRead));

    var left = Unsafe.Read<Vector256<Double>>(_dataTable.inArray1Ptr);
    var right = Unsafe.Read<Vector256<Double>>(_dataTable.inArray2Ptr);
    var selector = Unsafe.Read<Vector256<Double>>(_dataTable.inArray3Ptr);

    var blended = Avx.BlendVariable(left, right, selector);

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(left, right, selector, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, blends three operands read inline from the data table's input
/// buffers with <c>Unsafe.Read</c>, stores the result in the output buffer, and validates
/// it against the raw input buffers.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_UnsafeRead));

    // The reads stay nested inside the call: the scenario exercises the form
    // where operands are produced directly as arguments.
    var blended = Avx.BlendVariable(
        Unsafe.Read<Vector256<Single>>(_dataTable.inArray1Ptr),
        Unsafe.Read<Vector256<Single>>(_dataTable.inArray2Ptr),
        Unsafe.Read<Vector256<Single>>(_dataTable.inArray3Ptr));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, blends three operands loaded inline from the data table's
/// input buffers with <c>Avx.LoadVector256</c>, stores the result in the output buffer,
/// and validates it against the raw input buffers.
/// </summary>
public void RunBasicScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunBasicScenario_Load));

    // The loads stay nested inside the call: the scenario exercises the form
    // where operands are produced directly as arguments.
    var blended = Avx.BlendVariable(
        Avx.LoadVector256((Double*)(_dataTable.inArray1Ptr)),
        Avx.LoadVector256((Double*)(_dataTable.inArray2Ptr)),
        Avx.LoadVector256((Double*)(_dataTable.inArray3Ptr)));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(_dataTable.inArray1Ptr, _dataTable.inArray2Ptr, _dataTable.inArray3Ptr, _dataTable.outArrayPtr);
}
/// <summary>
/// Announces the scenario, obtains a <c>TestStruct</c> from <c>TestStruct.Create()</c>,
/// loads each of its three vector fields through a pointer with <c>Avx.LoadVector256</c>,
/// blends them with <c>Avx.BlendVariable</c>, stores the result in the output buffer, and
/// validates it against the struct's fields.
/// </summary>
public void RunStructLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunStructLclFldScenario_Load));

    var localStruct = TestStruct.Create();

    // Each operand is loaded from the address of a field of the local struct.
    var blended = Avx.BlendVariable(
        Avx.LoadVector256((Double*)(&localStruct._fld1)),
        Avx.LoadVector256((Double*)(&localStruct._fld2)),
        Avx.LoadVector256((Double*)(&localStruct._fld3)));

    Unsafe.Write(_dataTable.outArrayPtr, blended);
    ValidateResult(localStruct._fld1, localStruct._fld2, localStruct._fld3, _dataTable.outArrayPtr);
}
/// <summary>
/// Vectorized approximation of the natural logarithm for eight floats.
/// The original author states the absolute error is bounded by 1e-4.
/// </summary>
/// <remarks>
/// The raw IEEE-754 bits are shifted right by 23 to obtain the (biased) exponent, the
/// mantissa bits are re-tagged with the exponent of 1.0, and a degree-5 polynomial in that
/// value is evaluated in Horner form. Lanes that compare less than zero receive a NaN
/// offset and therefore produce NaN.
/// NOTE(review): only <c>x &lt; 0</c> is special-cased; zero, infinities and NaN inputs go
/// through the regular path — confirm callers never pass them.
/// </remarks>
/// <param name="x">Input lanes.</param>
/// <returns>Per-lane approximation of <c>ln(x)</c>.</returns>
public static Vector256<float> Log(Vector256<float> x)
{
    // According to the original author's BenchmarkDotNet runs, keeping all the
    // constants declared up-front yields a nearly 10% speed-up.
    const float offsetWhenNotNegative = -89.970756366f;
    const float offsetWhenNegative = float.NaN; // mirrors MathF.Log() on negative numbers
    const float c1 = 3.529304993f;
    const float c2 = -2.461222105f;
    const float c3 = 1.130626167f;
    const float c4 = -0.288739945f;
    const float c5 = 3.110401639e-2f;
    const float ln2 = 0.6931471805f;
    const int mantissaMask = 0x7FFFFF;
    const int exponentOfOne = 0x3F800000;

    Vector256<int> rawBits = x.As<float, int>();

    // Sign + biased exponent as a float (arithmetic shift keeps the sign bit).
    Vector256<float> exponent = Avx2.ConvertToVector256Single(Avx2.ShiftRightArithmetic(rawBits, 23));

    // offset = x < 0 ? NaN : -89.970756366f
    Vector256<float> isNegative = Avx.Compare(x, Vector256<float>.Zero, FloatComparisonMode.OrderedLessThanNonSignaling);
    Vector256<float> offset = Avx.BlendVariable(
        Vector256.Create(offsetWhenNotNegative),
        Vector256.Create(offsetWhenNegative),
        isNegative);

    // Keep the mantissa bits and force the exponent field to that of 1.0f.
    Vector256<float> m = Avx2.Or(
        Avx2.And(rawBits, Vector256.Create(mantissaMask)),
        Vector256.Create(exponentOfOne)).As<int, float>();

    // Horner evaluation of
    //   m * (c1 + m * (c2 + m * (c3 + m * (c4 + m * c5))))
    // built from the innermost term outwards.
    Vector256<float> poly = Avx.Multiply(m, Vector256.Create(c5));
    poly = Avx.Multiply(m, Avx.Add(Vector256.Create(c4), poly));
    poly = Avx.Multiply(m, Avx.Add(Vector256.Create(c3), poly));
    poly = Avx.Multiply(m, Avx.Add(Vector256.Create(c2), poly));
    poly = Avx.Multiply(m, Avx.Add(Vector256.Create(c1), poly));

    // result = poly + (offset + ln(2) * exponent)
    Vector256<float> exponentTerm = Avx.Add(offset, Avx.Multiply(Vector256.Create(ln2), exponent));
    return Avx.Add(poly, exponentTerm);
}
/// <summary>
/// Pins this instance's three vector fields, loads each through its pointer with
/// <c>Avx.LoadVector256</c>, blends them with <c>Avx.BlendVariable</c>, writes the result
/// into the output buffer owned by <paramref name="testClass"/>, and asks that test class
/// to validate it.
/// </summary>
/// <param name="testClass">Test class that supplies the output buffer and the validator.</param>
public void RunStructFldScenario_Load(SimpleTernaryOpTest__BlendVariableDouble testClass)
{
    fixed (Vector256<Double>* pLeft = &_fld1)
    fixed (Vector256<Double>* pRight = &_fld2)
    fixed (Vector256<Double>* pMask = &_fld3)
    {
        var blended = Avx.BlendVariable(
            Avx.LoadVector256((Double*)(pLeft)),
            Avx.LoadVector256((Double*)(pRight)),
            Avx.LoadVector256((Double*)(pMask)));

        Unsafe.Write(testClass._dataTable.outArrayPtr, blended);
        testClass.ValidateResult(_fld1, _fld2, _fld3, testClass._dataTable.outArrayPtr);
    }
}
/// <summary>
/// Announces the scenario, pins the three <c>_clsVar</c> vectors, loads each through its
/// pointer with <c>Avx.LoadVector256</c>, blends them with <c>Avx.BlendVariable</c>, stores
/// the result in the output buffer, and validates it.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector256<Double>* pLeft = &_clsVar1)
    fixed (Vector256<Double>* pRight = &_clsVar2)
    fixed (Vector256<Double>* pMask = &_clsVar3)
    {
        var blended = Avx.BlendVariable(
            Avx.LoadVector256((Double*)(pLeft)),
            Avx.LoadVector256((Double*)(pRight)),
            Avx.LoadVector256((Double*)(pMask)));

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(_clsVar1, _clsVar2, _clsVar3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Announces the scenario, pins this instance's three vector fields, loads each through
/// its pointer with <c>Avx.LoadVector256</c>, blends them with <c>Avx.BlendVariable</c>,
/// stores the result in the output buffer, and validates it.
/// </summary>
public void RunClassFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassFldScenario_Load));

    fixed (Vector256<Double>* pLeft = &_fld1)
    fixed (Vector256<Double>* pRight = &_fld2)
    fixed (Vector256<Double>* pMask = &_fld3)
    {
        var blended = Avx.BlendVariable(
            Avx.LoadVector256((Double*)(pLeft)),
            Avx.LoadVector256((Double*)(pRight)),
            Avx.LoadVector256((Double*)(pMask)));

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(_fld1, _fld2, _fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Announces the scenario, creates a fresh <c>SimpleTernaryOpTest__BlendVariableSingle</c>,
/// pins that local instance's three vector fields, loads each through its pointer with
/// <c>Avx.LoadVector256</c>, blends them with <c>Avx.BlendVariable</c>, stores the result in
/// this instance's output buffer, and validates it.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var instance = new SimpleTernaryOpTest__BlendVariableSingle();

    fixed (Vector256<Single>* pLeft = &instance._fld1)
    fixed (Vector256<Single>* pRight = &instance._fld2)
    fixed (Vector256<Single>* pMask = &instance._fld3)
    {
        var blended = Avx.BlendVariable(
            Avx.LoadVector256((Single*)(pLeft)),
            Avx.LoadVector256((Single*)(pRight)),
            Avx.LoadVector256((Single*)(pMask)));

        Unsafe.Write(_dataTable.outArrayPtr, blended);
        ValidateResult(instance._fld1, instance._fld2, instance._fld3, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Slab-style intersection of <paramref name="ray"/> with the box spanned by
/// <c>Minimum</c> and <c>Maximum</c>, computed on all axes at once with AVX.
/// </summary>
/// <param name="ray">Ray whose <c>Origin</c> and <c>Direction</c> are converted to <c>Vector256&lt;double&gt;</c>.</param>
/// <returns>
/// The entry and exit distances along the ray, or <c>(NaN, NaN)</c> when the entry distance
/// is greater than the exit distance or the exit distance is negative.
/// </returns>
public (double near, double far) IntersectAVX(Ray ray)
{
    Vector256<double> rayOrigin = (Vector256<double>)ray.Origin;
    Vector256<double> rayDirection = (Vector256<double>)ray.Direction;
    Vector256<double> slabMin = (Vector256<double>)Minimum;
    Vector256<double> slabMax = (Vector256<double>)Maximum;
    Vector256<double> zero = new Vector256<double>();

    // Axes with a zero direction component whose origin lies inside the slab are
    // opened up to (-inf, +inf) so they never constrain the result and NaN
    // (0 * inf) does not propagate.
    Vector256<double> directionIsZero = Avx.Compare(rayDirection, zero, FloatComparisonMode.OrderedEqualNonSignaling);
    Vector256<double> originAtOrAboveMin = Avx.Compare(rayOrigin, slabMin, FloatComparisonMode.OrderedGreaterThanOrEqualNonSignaling);
    Vector256<double> originAtOrBelowMax = Avx.Compare(rayOrigin, slabMax, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);
    Vector256<double> unboundedAxes = Avx.And(directionIsZero, Avx.And(originAtOrAboveMin, originAtOrBelowMax));

    slabMin = Avx.BlendVariable(slabMin, SIMDHelpers.BroadcastScalar4(double.NegativeInfinity), unboundedAxes);
    slabMax = Avx.BlendVariable(slabMax, SIMDHelpers.BroadcastScalar4(double.PositiveInfinity), unboundedAxes);

    // BlendVariable keys on each lane's most significant bit, i.e. the sign of the
    // direction, so axes with a negative direction (including -0) get their slab
    // bounds swapped.
    Vector256<double> entryPlanes = Avx.BlendVariable(slabMin, slabMax, rayDirection);
    Vector256<double> exitPlanes = Avx.BlendVariable(slabMax, slabMin, rayDirection);

    Vector256<double> inverseDirection = Avx.Divide(Vector256.Create(1D), rayDirection);
    Vector256<double> entryDistances = Avx.Multiply(Avx.Subtract(entryPlanes, rayOrigin), inverseDirection);
    Vector256<double> exitDistances = Avx.Multiply(Avx.Subtract(exitPlanes, rayOrigin), inverseDirection);

    // Horizontal reductions: largest entry distance and smallest exit distance.
    Vector128<double> nearPair = Sse2.Max(entryDistances.GetLower(), entryDistances.GetUpper());
    nearPair = Sse2.MaxScalar(nearPair, SIMDHelpers.Swap(nearPair));

    Vector128<double> farPair = Sse2.Min(exitDistances.GetLower(), exitDistances.GetUpper());
    farPair = Sse2.MinScalar(farPair, SIMDHelpers.Swap(farPair));

    bool entryAfterExit = Sse2.CompareScalarOrderedGreaterThan(nearPair, farPair);
    bool exitBehindOrigin = Sse2.CompareScalarOrderedLessThan(farPair, new Vector128<double>());

    return (entryAfterExit | exitBehindOrigin)
        ? (double.NaN, double.NaN)
        : (nearPair.ToScalar(), farPair.ToScalar());
}
/// <summary>
/// Computes, for four trees at once, the growth effective age implied by each tree's
/// height and the potential height growth over one time step.
/// (Judging by the name this is a Bruce-type height curve; the coefficients come from
/// <paramref name="site"/> — confirm against the scalar implementation.)
/// </summary>
/// <remarks>
/// <para>
/// <c>XX1 = ln(treeHeight / siteIndexFromGround) / B1 + X2toB2</c>;
/// <c>growthEffectiveAge = XX1^(1/B2) - X1</c>, replaced by 500 in lanes where
/// <c>XX1 &lt;= 0</c>.
/// </para>
/// <para>
/// <c>potentialHeight = siteIndexFromGround * exp(B1 * ((growthEffectiveAge + timeStepInYears + X1)^B2 - X2toB2))</c>.
/// </para>
/// </remarks>
/// <param name="site">Site constants providing B1, B2, X1, X2toB2 and SiteIndexFromGround.</param>
/// <param name="timeStepInYears">Length of the growth step, added to the effective age.</param>
/// <param name="treeHeight">Current heights of four trees.</param>
/// <param name="potentialHeightGrowth">Receives <c>potentialHeight - treeHeight</c> per lane.</param>
/// <returns>The growth effective age per lane.</returns>
public static Vector128<float> GetBrucePsmeAbgrGrowthEffectiveAge(SiteConstants site, float timeStepInYears, Vector128<float> treeHeight, out Vector128<float> potentialHeightGrowth)
{
    Vector128<float> B1 = AvxExtensions.BroadcastScalarToVector128(site.B1);
    Vector128<float> B2 = AvxExtensions.BroadcastScalarToVector128(site.B2);
    Vector128<float> X2toB2 = AvxExtensions.BroadcastScalarToVector128(site.X2toB2);
    Vector128<float> siteIndexFromGround128 = AvxExtensions.BroadcastScalarToVector128(site.SiteIndexFromGround);
    Vector128<float> X1 = AvxExtensions.BroadcastScalarToVector128(site.X1);

    Vector128<float> XX1 = Avx.Add(Avx.Divide(MathV.Ln(Avx.Divide(treeHeight, siteIndexFromGround128)), B1), X2toB2);

    // The comparison is <=, so the mask also covers XX1 == 0.
    Vector128<float> xx1NotPositive = Avx.CompareLessThanOrEqual(XX1, Vector128<float>.Zero);

    // Use an exact division for the exponent 1/B2. The previous Avx.Reciprocal (RCPPS)
    // is only an approximation with roughly 12 bits of precision, and an error in an
    // exponent is amplified by Pow.
    Vector128<float> oneOverB2 = Avx.Divide(AvxExtensions.BroadcastScalarToVector128(1.0F), B2);

    Vector128<float> growthEffectiveAge = Avx.Subtract(MathV.Pow(XX1, oneOverB2), X1);
    // Lanes with a non-positive XX1 fall back to a fixed effective age of 500.
    growthEffectiveAge = Avx.BlendVariable(growthEffectiveAge, AvxExtensions.BroadcastScalarToVector128(500.0F), xx1NotPositive);

    Vector128<float> timeStepInYearsPlusX1 = AvxExtensions.BroadcastScalarToVector128(timeStepInYears + site.X1);
    Vector128<float> potentialHeightPower = Avx.Multiply(B1, Avx.Subtract(MathV.Pow(Avx.Add(growthEffectiveAge, timeStepInYearsPlusX1), B2), X2toB2));
    Vector128<float> potentialHeight = Avx.Multiply(siteIndexFromGround128, MathV.Exp(potentialHeightPower));

    potentialHeightGrowth = Avx.Subtract(potentialHeight, treeHeight);
    return growthEffectiveAge;
}
// Converts one line of 4-channel float pixels to 4-channel byte pixels.
//
//   ipstart : start of the input line, read as floats
//   opstart : start of the output line, one byte written per input float
//   cb      : size of the input line in bytes (the input end pointer is ipstart + cb)
//
// As the scalar tail at the bottom shows, the 4th float of each pixel (alpha) is used to
// divide the other three channels: pixels whose alpha is below 0.5/255 are written as four
// zero bytes; otherwise channels 0..2 become clamp(c * 255 / alpha + 0.5) and alpha becomes
// clamp(alpha * 255 + 0.5). The vector paths do the same work 32 (AVX2) or 16 (SSE4.1)
// floats at a time and leave any remainder to the scalar loop.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
{
    float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb);
    byte * op = opstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var vzero = Vector256 <float> .Zero;
        var vmin = Vector256.Create(0.5f / byte.MaxValue);
        var vscale = Vector256.Create((float)byte.MaxValue);
        // Permutation indices loaded from a table defined elsewhere; presumably they undo the
        // per-128-bit-lane ordering left by the pack instructions below — TODO confirm.
        var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

        // Pull the end pointer back by one full iteration (Vector256<byte>.Count floats) so
        // that "ip <= ipe" means a whole iteration's worth of input is still available.
        ipe -= Vector256 <byte> .Count;
        while (ip <= ipe)
        {
            // Four vectors of 8 floats each per iteration.
            var vf0 = Avx.LoadVector256(ip);
            var vf1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
            var vf2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2);
            var vf3 = Avx.LoadVector256(ip + Vector256 <float> .Count * 3);
            ip += Vector256 <byte> .Count;

            // Presumably broadcasts each pixel's alpha across that pixel's four lanes
            // (mask defined elsewhere) — TODO confirm.
            var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
            var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
            var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
            var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

            // Clamp the divisor from below by vmin.
            vfa0 = Avx.Max(vfa0, vmin);
            vfa1 = Avx.Max(vfa1, vmin);
            vfa2 = Avx.Max(vfa2, vmin);
            vfa3 = Avx.Max(vfa3, vmin);

            // Divide by multiplying with the (approximate) reciprocal.
            vf0 = Avx.Multiply(vf0, Avx.Reciprocal(vfa0));
            vf1 = Avx.Multiply(vf1, Avx.Reciprocal(vfa1));
            vf2 = Avx.Multiply(vf2, Avx.Reciprocal(vfa2));
            vf3 = Avx.Multiply(vf3, Avx.Reciprocal(vfa3));

            // Presumably restores the (clamped) alpha value in the alpha lane — TODO confirm
            // against BlendMaskAlpha.
            vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
            vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
            vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
            vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

            // Lanes whose clamped divisor equals vmin (i.e. it was clamped, or was exactly
            // vmin) are zeroed.
            vf0 = Avx.BlendVariable(vf0, vzero, HWIntrinsics.AvxCompareEqual(vfa0, vmin));
            vf1 = Avx.BlendVariable(vf1, vzero, HWIntrinsics.AvxCompareEqual(vfa1, vmin));
            vf2 = Avx.BlendVariable(vf2, vzero, HWIntrinsics.AvxCompareEqual(vfa2, vmin));
            vf3 = Avx.BlendVariable(vf3, vzero, HWIntrinsics.AvxCompareEqual(vfa3, vmin));

            // Scale to the byte range.
            vf0 = Avx.Multiply(vf0, vscale);
            vf1 = Avx.Multiply(vf1, vscale);
            vf2 = Avx.Multiply(vf2, vscale);
            vf3 = Avx.Multiply(vf3, vscale);

            var vi0 = Avx.ConvertToVector256Int32(vf0);
            var vi1 = Avx.ConvertToVector256Int32(vf1);
            var vi2 = Avx.ConvertToVector256Int32(vf2);
            var vi3 = Avx.ConvertToVector256Int32(vf3);

            // Narrow int32 -> int16 -> uint8 with saturation, then reorder the 32-bit groups.
            var vs0 = Avx2.PackSignedSaturate(vi0, vi1);
            var vs1 = Avx2.PackSignedSaturate(vi2, vi3);

            var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1);
            vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte();

            Avx.Store(op, vb0);
            op += Vector256 <byte> .Count;
        }
        // Restore the real end pointer for the scalar tail.
        ipe += Vector256 <byte> .Count;
    }
    else if (Sse41.IsSupported)
    {
        // Same pipeline as the AVX2 branch, 16 floats per iteration; no final permute is
        // applied here.
        var vzero = Vector128 <float> .Zero;
        var vmin = Vector128.Create(0.5f / byte.MaxValue);
        var vscale = Vector128.Create((float)byte.MaxValue);

        ipe -= Vector128 <byte> .Count;
        while (ip <= ipe)
        {
            var vf0 = Sse.LoadVector128(ip);
            var vf1 = Sse.LoadVector128(ip + Vector128 <float> .Count);
            var vf2 = Sse.LoadVector128(ip + Vector128 <float> .Count * 2);
            var vf3 = Sse.LoadVector128(ip + Vector128 <float> .Count * 3);
            ip += Vector128 <byte> .Count;

            var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
            var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
            var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
            var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

            vfa0 = Sse.Max(vfa0, vmin);
            vfa1 = Sse.Max(vfa1, vmin);
            vfa2 = Sse.Max(vfa2, vmin);
            vfa3 = Sse.Max(vfa3, vmin);

            vf0 = Sse.Multiply(vf0, Sse.Reciprocal(vfa0));
            vf1 = Sse.Multiply(vf1, Sse.Reciprocal(vfa1));
            vf2 = Sse.Multiply(vf2, Sse.Reciprocal(vfa2));
            vf3 = Sse.Multiply(vf3, Sse.Reciprocal(vfa3));

            vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
            vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
            vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
            vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

            vf0 = Sse41.BlendVariable(vf0, vzero, Sse.CompareEqual(vfa0, vmin));
            vf1 = Sse41.BlendVariable(vf1, vzero, Sse.CompareEqual(vfa1, vmin));
            vf2 = Sse41.BlendVariable(vf2, vzero, Sse.CompareEqual(vfa2, vmin));
            vf3 = Sse41.BlendVariable(vf3, vzero, Sse.CompareEqual(vfa3, vmin));

            vf0 = Sse.Multiply(vf0, vscale);
            vf1 = Sse.Multiply(vf1, vscale);
            vf2 = Sse.Multiply(vf2, vscale);
            vf3 = Sse.Multiply(vf3, vscale);

            var vi0 = Sse2.ConvertToVector128Int32(vf0);
            var vi1 = Sse2.ConvertToVector128Int32(vf1);
            var vi2 = Sse2.ConvertToVector128Int32(vf2);
            var vi3 = Sse2.ConvertToVector128Int32(vf3);

            var vs0 = Sse2.PackSignedSaturate(vi0, vi1);
            var vs1 = Sse2.PackSignedSaturate(vi2, vi3);

            var vb0 = Sse2.PackUnsignedSaturate(vs0, vs1);

            Sse2.Store(op, vb0);
            op += Vector128 <byte> .Count;
        }
        ipe += Vector128 <byte> .Count;
    }
#endif

    // Scalar path: handles the whole line when no vector path ran, otherwise the remainder.
    // NOTE(review): the constants are routed through Vector4(...).X; the reason is not
    // visible in this block — confirm before simplifying.
    float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

    while (ip < ipe)
    {
        float f3 = ip[3];
        if (f3 < fmin)
        {
            // Alpha too small to divide by: emit four zero bytes with one 32-bit store.
            *(uint *)op = 0;
        }
        else
        {
            float f3i = fmax / f3;
            byte o0 = ClampToByte((int)(ip[0] * f3i + fround));
            byte o1 = ClampToByte((int)(ip[1] * f3i + fround));
            byte o2 = ClampToByte((int)(ip[2] * f3i + fround));
            byte o3 = ClampToByte((int)(f3 * fmax + fround));
            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op[3] = o3;
        }

        ip += 4;
        op += 4;
    }
}
/// <summary>
/// Sizes the sampling grid from <c>TOTALBYTES</c>, fills the x/y coordinate tables, and
/// runs the Mandelbrot iteration eight points at a time with AVX.
/// </summary>
/// <remarks>
/// For every vector of eight points the per-lane iteration count is written to
/// <c>results2</c> and the final <c>x² + y²</c> to <c>testValue2</c>. Both resolutions are
/// rounded down to a multiple of eight so each row splits into whole vectors.
/// </remarks>
public unsafe void Vector256Mandel()
{
    // ---- grid set-up ----
    int floatL3Size = TOTALBYTES / sizeof(float);

    resolutionX = (int)MathF.Floor(MathF.Sqrt(floatL3Size * ratioy_x));
    int excessX = resolutionX % 8;
    if (excessX != 0)
    {
        resolutionX -= excessX;
    }

    resolutionY = (int)MathF.Floor(resolutionX * ratioy_x);
    int excessY = resolutionY % 8;
    if (excessY != 0)
    {
        resolutionY -= excessY;
    }

    STEP_X = (RIGHT_X - LEFT_X) / resolutionX;
    STEP_Y = STEP_X; // same step on both axes (not ratioy_x * STEP_X)
    numberOfPoints = resolutionX * resolutionY;

    results2 = new float[numberOfPoints];
    xPoints = new float[resolutionX];
    yPoints = new float[resolutionY];

    for (int column = 0; column < resolutionX; column++)
    {
        xPoints.Span[column] = LEFT_X + column * STEP_X;
    }

    for (int row = 0; row < resolutionY; row++)
    {
        yPoints.Span[row] = TOP_Y - row * STEP_Y;
    }

    // ---- vectorized iteration ----
    const int maxInter = 256;

    ReadOnlySpan<float> ySpan = yPoints.Span;
    ReadOnlySpan<Vector256<float>> xSpan = MemoryMarshal.Cast<float, Vector256<float>>(xPoints.Span);
    Span<Vector256<float>> res = MemoryMarshal.Cast<float, Vector256<float>>(results2.Span);
    Span<Vector256<float>> testSpan = MemoryMarshal.Cast<float, Vector256<float>>(testValue2.Span);

    Vector256<float> oneVec = Vector256.Create(1.0f);
    Vector256<float> fourVec = Vector256.Create(4.0f);
    int resVectorNumber = 0;

    for (int rowIndex = 0; rowIndex < ySpan.Length; rowIndex++)
    {
        Vector256<float> currYVec = Vector256.Create(ySpan[rowIndex]);

        for (int vectorIndex = 0; vectorIndex < xSpan.Length; vectorIndex++)
        {
            Vector256<float> currXVec = xSpan[vectorIndex];
            Vector256<float> xSquVec = Vector256.Create(0.0f);
            Vector256<float> ySquVec = Vector256.Create(0.0f);
            Vector256<float> zSquVec = Vector256.Create(0.0f);
            Vector256<float> interVec = Vector256.Create(0.0f);

            // Per-lane increment: 1 while the lane is still bounded, 0 once it escaped.
            Vector256<float> sumVector = oneVec;
            int inter = 0;
            bool goOn;

            do
            {
                // x' = x² - y² + cx ; y' = (x + y)² - y² - x² + cy  (= 2xy + cy)
                Vector256<float> xVec = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec);
                Vector256<float> yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec);

                xSquVec = Avx.Multiply(xVec, xVec);
                ySquVec = Avx.Multiply(yVec, yVec);
                Vector256<float> xPlusY = Avx.Add(xVec, yVec);
                zSquVec = Avx.Multiply(xPlusY, xPlusY);

                // Lane mask: x² + y² <= 4.0 ?
                Vector256<float> test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

                // Takes sumVector where the mask is set and zero otherwise, so an
                // escaped lane stays at zero from then on.
                sumVector = Avx.BlendVariable(Vector256<float>.Zero, sumVector, test);

                // Continue while any lane is still bounded and the cut-off is not reached.
                goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter);
                if (goOn)
                {
                    interVec = Avx.Add(interVec, sumVector);
                    inter++;
                }
            }
            while (goOn);

            testSpan[resVectorNumber] = Avx.Add(xSquVec, ySquVec);
            res[resVectorNumber] = interVec;
            resVectorNumber++;
        }
    }
}
/// <summary>
/// Runs the Mandelbrot iteration eight points at a time with AVX over the pre-filled
/// <c>xPoints</c>/<c>yPoints</c> tables and writes the per-lane iteration counts to
/// <c>results</c>.
/// </summary>
/// <remarks>
/// Before iterating, each vector of eight points is tested against the main cardioid and
/// the period-2 bulb. When the whole vector is found to be inside the set by either test,
/// the iteration is skipped and every lane is assigned 255.
/// Previously a vector lying entirely inside the main cardioid skipped the iteration but
/// kept an iteration count of 0, unlike the bulb case; both shortcuts now agree.
/// </remarks>
public unsafe void Vector256Mandel()
{
    int countX = 0, countY = 0;
    int maxInter = 256;
    int inter;

    ReadOnlySpan<float> ySpan = yPoints.Span;
    ReadOnlySpan<Vector256<float>> xSpan = MemoryMarshal.Cast<float, Vector256<float>>(xPoints.Span);
    Span<Vector256<float>> res = MemoryMarshal.Cast<float, Vector256<float>>(results.Span);
    int resVectorNumber = 0;

    Vector256<float> xVec, yVec;
    Vector256<float> zeroVec = Vector256<float>.Zero;
    var oneVec = Vector256.Create(1.0f);
    var fourVec = Vector256.Create(4.0f);
    var one4Vec = Vector256.Create(0.25f);
    var one16Vec = Vector256.Create(1.0f / 16.0f);
    Vector256<float> qVec;
    Vector256<float> test;

    while (countY < ySpan.Length)
    {
        var currYVec = Vector256.Create(ySpan[countY]);

        while (countX < xSpan.Length)
        {
            Vector256<float> currXVec = xSpan[countX];
            Vector256<float> xSquVec = zeroVec;
            Vector256<float> ySquVec = zeroVec;
            Vector256<float> zSquVec = zeroVec;
            Vector256<float> interVec = zeroVec;
            Vector256<float> sumVector;
            inter = 0;
            bool goOn;

            // Main cardioid test: q = (x - 1/4)² + y²; a lane is outside when
            // q * (q + (x - 1/4)) > y² / 4.
            Vector256<float> temp = Avx.Subtract(currXVec, one4Vec);
            Vector256<float> temp1 = Avx.Multiply(currYVec, currYVec);
            qVec = Avx.Add(Avx.Multiply(temp, temp), temp1);
            Vector256<float> temp2 = Avx.Multiply(qVec, Avx.Add(qVec, temp));
            test = Avx.Compare(temp2, Avx.Multiply(one4Vec, temp1), FloatComparisonMode.OrderedGreaterThanNonSignaling);
            goOn = (Avx.MoveMask(test) > 0); // any lane outside the cardioid?

            if (goOn)
            {
                // Period-2 bulb test: a lane is outside when (x + 1)² + y² > 1/16.
                temp2 = Avx.Add(currXVec, oneVec);
                temp = Avx.Add(Avx.Multiply(temp2, temp2), temp1);
                test = Avx.Compare(temp, one16Vec, FloatComparisonMode.OrderedGreaterThanNonSignaling);
                goOn = Avx.MoveMask(test) > 0; // any lane outside the bulb?
            }

            if (!goOn)
            {
                // Every lane was classified as inside the set by one of the two tests, so
                // no iteration is needed: give all points the maximum value. This now also
                // covers the all-inside-cardioid case, which used to leave the counts at 0.
                interVec = Vector256.Create(255.0f);
            }

            while (goOn)
            {
                // x' = x² - y² + cx ; y' = (x + y)² - y² - x² + cy  (= 2xy + cy)
                xVec = Avx.Add(Avx.Subtract(xSquVec, ySquVec), currXVec);
                yVec = Avx.Add(Avx.Subtract(Avx.Subtract(zSquVec, ySquVec), xSquVec), currYVec);
                xSquVec = Avx.Multiply(xVec, xVec);
                ySquVec = Avx.Multiply(yVec, yVec);
                temp = Avx.Add(xVec, yVec);
                zSquVec = Avx.Multiply(temp, temp);

                // Lane mask: x² + y² <= 4.0 ?
                test = Avx.Compare(Avx.Add(xSquVec, ySquVec), fourVec, FloatComparisonMode.OrderedLessThanOrEqualNonSignaling);

                // 1 for lanes still bounded, 0 for lanes that escaped.
                sumVector = Avx.BlendVariable(zeroVec, oneVec, test);

                // Continue while any lane is still bounded and the cut-off is not reached.
                goOn = (Avx.MoveMask(test) > 0) & (inter < maxInter);
                if (goOn)
                {
                    interVec = Avx.Add(interVec, sumVector);
                    inter++;
                }
            }

            res[resVectorNumber] = interVec;
            resVectorNumber++;
            countX++;
        }

        countX = 0;
        countY++;
    }
}
/// <summary>
/// Per-lane select: returns the lane of <paramref name="a"/> where the corresponding lane
/// of <paramref name="m"/> has its most significant bit set, otherwise the lane of
/// <paramref name="b"/>.
/// </summary>
/// <param name="m">Selection mask, reinterpreted as single-precision lanes.</param>
/// <param name="a">Value taken where the mask is set.</param>
/// <param name="b">Value taken where the mask is clear.</param>
/// <returns>The blended vector.</returns>
public static f32 Select_f32(m32 m, f32 a, f32 b)
    // BlendVariable picks its second operand for set mask lanes, hence (b, a).
    => Avx.BlendVariable(b, a, m.AsSingle());