/// <summary>
/// Test scenario: pins the <c>_clsVar1</c> and <c>_clsVar2</c> fields, loads both through
/// <see cref="Avx.LoadVector256(Double*)"/>, computes <see cref="Avx.Max(Vector256{Double}, Vector256{Double})"/>
/// of the two, writes the result to the data table's output buffer and validates it.
/// </summary>
public void RunClsVarScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClsVarScenario_Load));

    fixed (Vector256<Double>* pClsVar1 = &_clsVar1)
    fixed (Vector256<Double>* pClsVar2 = &_clsVar2)
    {
        // Load the operands explicitly from memory (left first, then right).
        Vector256<Double> left = Avx.LoadVector256((Double*)(pClsVar1));
        Vector256<Double> right = Avx.LoadVector256((Double*)(pClsVar2));
        Vector256<Double> maximum = Avx.Max(left, right);

        Unsafe.Write(_dataTable.outArrayPtr, maximum);
        ValidateResult(_clsVar1, _clsVar2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// Reads <paramref name="cb"/> bytes of floats from <paramref name="ipstart"/> and writes
/// <c>sqrt(max(value, 0))</c> for each one to <paramref name="opstart"/>.
/// </summary>
/// <param name="ipstart">Start of the input float buffer.</param>
/// <param name="opstart">Start of the output float buffer.</param>
/// <param name="cb">Size of the input buffer in bytes.</param>
unsafe private static void greyLinearToGreyFloat(byte* ipstart, byte* opstart, int cb)
{
    float* src = (float*)ipstart, srcEnd = (float*)(ipstart + cb), dst = (float*)opstart;

#if HWINTRINSICS
    if (Avx.IsSupported)
    {
        var zero256 = Vector256<float>.Zero;

        // Last position from which a whole 256-bit vector can still be read.
        float* lastFull = srcEnd - Vector256<float>.Count;
        while (src <= lastFull)
        {
            var lanes = Avx.Max(zero256, Avx.LoadVector256(src));
            src += Vector256<float>.Count;

            lanes = Avx.Sqrt(lanes);

            Avx.Store(dst, lanes);
            dst += Vector256<float>.Count;
        }
    }
    else
#endif
    {
        var zeroV = Vector<float>.Zero;

        // Last position from which a whole VectorF can still be read.
        float* lastFull = srcEnd - VectorF.Count;
        while (src <= lastFull)
        {
            var lanes = Unsafe.ReadUnaligned<VectorF>(src);
            src += VectorF.Count;

            lanes = Vector.SquareRoot(Vector.Max(lanes, zeroV));

            Unsafe.WriteUnaligned(dst, lanes);
            dst += VectorF.Count;
        }
    }

    // Scalar tail for whatever did not fill a whole vector.
    float fmin = Vector4.Zero.X;
    while (src < srcEnd)
    {
        *dst = MaxF(*src, fmin).Sqrt();
        dst++;
        src++;
    }
}
/// <summary>
/// Test scenario: creates a local <c>SimpleBinaryOpTest__MaxSingle</c> instance, pins its
/// <c>_fld1</c> and <c>_fld2</c> fields, loads both through <see cref="Avx.LoadVector256(Single*)"/>,
/// computes <see cref="Avx.Max(Vector256{Single}, Vector256{Single})"/>, writes the result to the
/// data table's output buffer and validates it.
/// </summary>
public void RunClassLclFldScenario_Load()
{
    TestLibrary.TestFramework.BeginScenario(nameof(RunClassLclFldScenario_Load));

    var test = new SimpleBinaryOpTest__MaxSingle();

    fixed (Vector256<Single>* pFld1 = &test._fld1)
    fixed (Vector256<Single>* pFld2 = &test._fld2)
    {
        // Load the operands explicitly from memory (left first, then right).
        Vector256<Single> left = Avx.LoadVector256((Single*)(pFld1));
        Vector256<Single> right = Avx.LoadVector256((Single*)(pFld2));
        Vector256<Single> maximum = Avx.Max(left, right);

        Unsafe.Write(_dataTable.outArrayPtr, maximum);
        ValidateResult(test._fld1, test._fld2, _dataTable.outArrayPtr);
    }
}
/// <summary>
/// For every input <c>x[i]</c>, linearly interpolates into the two equally spaced lookup tables
/// <paramref name="A"/> and <paramref name="B"/> and blends the two results as
/// <c>(1 - weightB) * fA(x) + weightB * fB(x)</c>.
/// </summary>
/// <param name="x">Input values. Must be non-empty.</param>
/// <param name="A">First table; entry <c>k</c> corresponds to <c>minXA + k * deltaA</c>. Must be non-empty.</param>
/// <param name="minXA">Input value mapped to <c>A[0]</c>.</param>
/// <param name="maxXA">Input value mapped to the last entry of <paramref name="A"/>.</param>
/// <param name="B">Second table, laid out like <paramref name="A"/>. Must be non-empty.</param>
/// <param name="minXB">Input value mapped to <c>B[0]</c>.</param>
/// <param name="maxXB">Input value mapped to the last entry of <paramref name="B"/>.</param>
/// <param name="weightB">Blend weight of table B; table A receives <c>1 - weightB</c>.</param>
/// <returns>
/// A new array of length <c>outputVectorSize</c>. Entries at or beyond <c>x.Length</c> stay zero.
/// </returns>
/// <remarks>
/// Uses AVX, AVX2 and SSE4.1 intrinsics without checking <c>IsSupported</c>.
/// NOTE(review): presumably the caller performs that check - confirm.
/// Earlier versions processed <c>x</c> strictly in groups of four and therefore read past
/// <c>x</c> and wrote past the result whenever <c>x.Length</c> was not a multiple of four or the
/// result was shorter than <c>x</c>. The vector loop now only covers complete groups that fit in
/// both arrays; remaining elements are computed by a scalar loop that mirrors the vector math.
/// </remarks>
private static unsafe double[] BilinearInterpol_AVX(
    double[] x, double[] A, double minXA, double maxXA,
    double[] B, double minXB, double maxXB, double weightB)
{
    double[] z = new double[outputVectorSize];

    double weightA = 1 - weightB;
    double deltaA = (maxXA - minXA) / (double)(A.Length - 1);
    double deltaB = (maxXB - minXB) / (double)(B.Length - 1);
    double invDeltaA = 1.0 / deltaA;
    double invDeltaB = 1.0 / deltaB;

    // Never touch memory beyond the end of either the input or the output.
    int count = Math.Min(x.Length, z.Length);
    int vectorEnd = count - (count % Vector256<double>.Count);

    fixed (double* pX = &x[0], pA = &A[0], pB = &B[0], pZ = &z[0])
    {
        Vector256<double> vWeightB = Vector256.Create(weightB);
        Vector256<double> vWeightA = Vector256.Create(weightA);
        Vector256<double> vMinXA = Vector256.Create(minXA);
        Vector256<double> vMaxXA = Vector256.Create(maxXA);
        Vector256<double> vMinXB = Vector256.Create(minXB);
        Vector256<double> vMaxXB = Vector256.Create(maxXB);
        Vector256<double> vDeltaA = Vector256.Create(deltaA);
        Vector256<double> vDeltaB = Vector256.Create(deltaB);
        Vector256<double> vInvDeltaA = Vector256.Create(invDeltaA);
        Vector256<double> vInvDeltaB = Vector256.Create(invDeltaB);
        Vector128<int> ALengthMinusOne = Vector128.Create(A.Length - 1);
        Vector128<int> BLengthMinusOne = Vector128.Create(B.Length - 1);
        Vector128<int> One = Vector128.Create(1);

        for (int i = 0; i < vectorEnd; i += Vector256<double>.Count)
        {
            Vector256<double> currentX = Avx.LoadVector256(pX + i);

            // Determine the largest index a such that the reference input xA of A[a] satisfies xA <= x[i].
            // This involves casting from double to int; here we use a vector conversion.
            Vector256<double> aDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXA), vInvDeltaA);
            Vector128<int> a = Avx.ConvertToVector128Int32WithTruncation(aDouble);
            a = Sse41.Min(Sse41.Max(a, Vector128<int>.Zero), ALengthMinusOne);
            Vector128<int> aPlusOne = Sse41.Min(Sse2.Add(a, One), ALengthMinusOne);

            // Now, get the reference input, xA, for our index a.
            // This involves casting from int to double.
            Vector256<double> xA = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(a), vDeltaA), vMinXA);

            // Now, compute the lambda for our A reference point.
            Vector256<double> currentXNormA = Avx.Max(vMinXA, Avx.Min(currentX, vMaxXA));
            Vector256<double> lambdaA = Avx.Multiply(Avx.Subtract(currentXNormA, xA), vInvDeltaA);

            // Now, we need to load up our reference points using vector gather operations.
            Vector256<double> AVector = Avx2.GatherVector256(pA, a, 8);
            Vector256<double> AVectorPlusOne = Avx2.GatherVector256(pA, aPlusOne, 8);

            // Now, do all of the above for our B reference point.
            Vector256<double> bDouble = Avx.Multiply(Avx.Subtract(currentX, vMinXB), vInvDeltaB);
            Vector128<int> b = Avx.ConvertToVector128Int32WithTruncation(bDouble);
            b = Sse41.Min(Sse41.Max(b, Vector128<int>.Zero), BLengthMinusOne);
            Vector128<int> bPlusOne = Sse41.Min(Sse2.Add(b, One), BLengthMinusOne);
            Vector256<double> xB = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(b), vDeltaB), vMinXB);
            Vector256<double> currentXNormB = Avx.Max(vMinXB, Avx.Min(currentX, vMaxXB));
            Vector256<double> lambdaB = Avx.Multiply(Avx.Subtract(currentXNormB, xB), vInvDeltaB);
            Vector256<double> BVector = Avx2.GatherVector256(pB, b, 8);
            Vector256<double> BVectorPlusOne = Avx2.GatherVector256(pB, bPlusOne, 8);

            Vector256<double> interpolatedA = Avx.Add(AVector, Avx.Multiply(lambdaA, Avx.Subtract(AVectorPlusOne, AVector)));
            Vector256<double> interpolatedB = Avx.Add(BVector, Avx.Multiply(lambdaB, Avx.Subtract(BVectorPlusOne, BVector)));
            Vector256<double> newZ = Avx.Add(Avx.Multiply(vWeightA, interpolatedA), Avx.Multiply(vWeightB, interpolatedB));

            Avx.Store(pZ + i, newZ);
        }
    }

    // Scalar tail: the elements that do not fill a complete vector.
    for (int i = vectorEnd; i < count; i++)
    {
        double interpolatedA = BilinearInterpol_ScalarTerm(x[i], A, minXA, maxXA, deltaA, invDeltaA);
        double interpolatedB = BilinearInterpol_ScalarTerm(x[i], B, minXB, maxXB, deltaB, invDeltaB);
        z[i] = weightA * interpolatedA + weightB * interpolatedB;
    }

    return z;
}

/// <summary>
/// Scalar counterpart of one table lookup of <c>BilinearInterpol_AVX</c>: linear interpolation of
/// <paramref name="table"/> at <paramref name="value"/>, following the vector code operation by operation.
/// </summary>
private static double BilinearInterpol_ScalarTerm(
    double value, double[] table, double minX, double maxX, double delta, double invDelta)
{
    int last = table.Length - 1;
    double position = (value - minX) * invDelta;

    // The vector conversion yields int.MinValue for NaN and out-of-range inputs, which the clamp
    // below turns into index 0; do the same here instead of relying on the scalar cast.
    int index = 0;
    if (position > -2147483649.0 && position < 2147483648.0)
    {
        index = Math.Min(Math.Max((int)position, 0), last);
    }

    int next = Math.Min(index + 1, last);
    double reference = index * delta + minX;

    // Same operand order as Avx.Min(currentX, vMax) and Avx.Max(vMin, ...): the second operand
    // wins when the comparison is false (including NaN).
    double upperBounded = value < maxX ? value : maxX;
    double clamped = minX > upperBounded ? minX : upperBounded;

    double lambda = (clamped - reference) * invDelta;
    double low = table[index];
    return low + lambda * (table[next] - low);
}
/// <summary>
/// Returns <see cref="Avx"/>.Max of <paramref name="a"/> and <paramref name="b"/>.
/// </summary>
/// <param name="a">First operand.</param>
/// <param name="b">Second operand.</param>
/// <returns>The result of <c>Avx.Max(a, b)</c>.</returns>
public static f32 Max_f32(f32 a, f32 b) => Avx.Max(a, b);
/// <summary>
/// In-place lookup-table conversion of a float buffer made of 4-float pixels whose 4th component is alpha.
/// The first three components are divided by alpha, scaled by <paramref name="lutmax"/>, clamped to
/// <c>[0, lutmax]</c>, replaced by a linear interpolation between adjacent table entries and multiplied
/// by alpha again.
/// </summary>
/// <param name="ipstart">Start of the buffer; it is read and overwritten.</param>
/// <param name="opstart">Must equal <paramref name="ipstart"/> (asserted in debug builds).</param>
/// <param name="lutstart">Lookup table; entries <c>index</c> and <c>index + 1</c> are read for each value.</param>
/// <param name="lutmax">Largest table position a value is scaled to.</param>
/// <param name="cb">Size of the buffer in bytes.</param>
unsafe public static void ConvertFloat3A(byte* ipstart, byte* opstart, float* lutstart, int lutmax, int cb)
{
    Debug.Assert(ipstart == opstart);

    float* cur = (float*)ipstart, end = (float*)(ipstart + cb);
    float* lut = lutstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        // NOTE(review): GatherMask3x presumably enables the gather for the three colour lanes only,
        // leaving the alpha lane at the gather source value (1) - confirm against HWIntrinsics.
        var gatherMask = Avx.BroadcastVector128ToVector256((float*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.GatherMask3x)));
        var lutMax8 = Vector256.Create((float)lutmax);
        var zero8 = Vector256<float>.Zero;
        var oneF8 = Vector256.Create(1f);
        var oneI8 = Vector256.Create(1);

        // Last position from which a whole 256-bit vector can still be processed.
        float* limit = end - Vector256<float>.Count;
        while (cur <= limit)
        {
            var value = Avx.Max(zero8, Avx.LoadVector256(cur));
            var alpha = Avx.Shuffle(value, value, HWIntrinsics.ShuffleMaskAlpha);

            // value * (lutmax / alpha), then clamp to the table range.
            value = Avx.Multiply(value, Avx.Multiply(lutMax8, Avx.Reciprocal(alpha)));
            value = Avx.Min(value, lutMax8);

            var index = Avx.ConvertToVector256Int32WithTruncation(value);
            var indexF = Avx.ConvertToVector256Single(index);

            var lo = Avx2.GatherMaskVector256(oneF8, lut, index, gatherMask, sizeof(float));
            var hi = Avx2.GatherMaskVector256(oneF8, lut, Avx2.Add(index, oneI8), gatherMask, sizeof(float));

            value = HWIntrinsics.Lerp(lo, hi, Avx.Subtract(value, indexF));
            value = Avx.Multiply(value, alpha);

            Avx.Store(cur, value);
            cur += Vector256<float>.Count;
        }
    }
#endif

    // Pixel-at-a-time path; also finishes what the vector loop left over.
    {
        var lutMax4 = new Vector4(lutmax);
        var zero4 = Vector4.Zero;
        float alphaMin = new Vector4(1 / 1024f).X;

        while (cur < end)
        {
            var pixel = Unsafe.ReadUnaligned<Vector4>(cur);
            float alpha = pixel.W;

            if (f3IsBelow(alpha, alphaMin))
            {
                // Alpha too small to divide by: clear the whole pixel.
                Unsafe.WriteUnaligned(cur, zero4);
            }
            else
            {
                pixel = (pixel * lutMax4 / alpha).Clamp(zero4, lutMax4);

                float f0 = pixel.X;
                float f1 = pixel.Y;
                float f2 = pixel.Z;

                uint i0 = (uint)f0;
                uint i1 = (uint)f1;
                uint i2 = (uint)f2;

                cur[0] = Lerp(lut[i0], lut[i0 + 1], f0 - (int)i0) * alpha;
                cur[1] = Lerp(lut[i1], lut[i1 + 1], f1 - (int)i1) * alpha;
                cur[2] = Lerp(lut[i2], lut[i2 + 1], f2 - (int)i2) * alpha;
            }

            cur += 4;
        }
    }

    // Local predicate kept trivial so the branch reads as a guard.
    static bool f3IsBelow(float alpha, float threshold) => alpha < threshold;
}
/// <summary>
/// In-place lookup-table conversion of a float buffer: each value is scaled by
/// <paramref name="lutmax"/>, clamped to <c>[0, lutmax]</c> and replaced by a linear interpolation
/// between the two adjacent table entries.
/// </summary>
/// <param name="ipstart">Start of the buffer; it is read and overwritten.</param>
/// <param name="opstart">Must equal <paramref name="ipstart"/> (asserted in debug builds).</param>
/// <param name="lutstart">Lookup table; entries <c>index</c> and <c>index + 1</c> are read for each value.</param>
/// <param name="lutmax">Largest table position a value is scaled to.</param>
/// <param name="cb">Size of the buffer in bytes.</param>
unsafe public static void ConvertFloat(byte* ipstart, byte* opstart, float* lutstart, int lutmax, int cb)
{
    Debug.Assert(ipstart == opstart);

    float* cur = (float*)ipstart, end = (float*)(ipstart + cb);
    float* lut = lutstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var lutMax8 = Vector256.Create((float)lutmax);
        var zero8 = Vector256<float>.Zero;
        var one8 = Vector256.Create(1);

        // Last position from which a whole 256-bit vector can still be processed.
        float* limit = end - Vector256<float>.Count;
        while (cur <= limit)
        {
            var scaled = Avx.Multiply(lutMax8, Avx.LoadVector256(cur));
            scaled = Avx.Min(Avx.Max(zero8, scaled), lutMax8);

            var index = Avx.ConvertToVector256Int32WithTruncation(scaled);
            var indexF = Avx.ConvertToVector256Single(index);

            var lo = Avx2.GatherVector256(lut, index, sizeof(float));
            var hi = Avx2.GatherVector256(lut, Avx2.Add(index, one8), sizeof(float));

            scaled = HWIntrinsics.Lerp(lo, hi, Avx.Subtract(scaled, indexF));

            Avx.Store(cur, scaled);
            cur += Vector256<float>.Count;
        }
    }
    else
#endif
    {
        var lutMax4 = new Vector4(lutmax);
        var zero4 = Vector4.Zero;

        // Last position from which four floats can still be processed.
        float* limit = end - 4;
        while (cur <= limit)
        {
            var scaled = (Unsafe.ReadUnaligned<Vector4>(cur) * lutMax4).Clamp(zero4, lutMax4);

            float f0 = scaled.X;
            float f1 = scaled.Y;
            float f2 = scaled.Z;
            float f3 = scaled.W;

            uint i0 = (uint)f0;
            uint i1 = (uint)f1;
            uint i2 = (uint)f2;
            uint i3 = (uint)f3;

            cur[0] = Lerp(lut[i0], lut[i0 + 1], f0 - (int)i0);
            cur[1] = Lerp(lut[i1], lut[i1 + 1], f1 - (int)i1);
            cur[2] = Lerp(lut[i2], lut[i2 + 1], f2 - (int)i2);
            cur[3] = Lerp(lut[i3], lut[i3 + 1], f3 - (int)i3);

            cur += 4;
        }
    }

    // Scalar tail shared by both paths: one float at a time.
    float fmin = Vector4.Zero.X, flmax = new Vector4(lutmax).X;
    while (cur < end)
    {
        float f = (*cur * flmax).Clamp(fmin, flmax);
        uint i = (uint)f;
        *cur = Lerp(lut[i], lut[i + 1], f - i);
        cur++;
    }
}
/// <summary>
/// Converts a line of 4-float pixels (4th component is alpha) to 4-byte pixels. The first three
/// components are divided by alpha and scaled to the byte range; alpha itself is scaled to the byte
/// range. Pixels whose alpha is below <c>0.5 / 255</c> are written as four zero bytes.
/// </summary>
/// <param name="ipstart">Start of the float input.</param>
/// <param name="opstart">Start of the byte output.</param>
/// <param name="cb">Size of the input in bytes.</param>
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    float* src = (float*)ipstart, srcEnd = (float*)(ipstart + cb);
    byte* dst = opstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var zeros = Vector256<float>.Zero;
        var alphaFloor = Vector256.Create(0.5f / byte.MaxValue);
        var byteScale = Vector256.Create((float)byte.MaxValue);
        var permuteMask = Avx.LoadVector256((int*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

        // Each iteration consumes Vector256<byte>.Count floats and produces as many bytes.
        float* limit = srcEnd - Vector256<byte>.Count;
        while (src <= limit)
        {
            var px0 = Avx.LoadVector256(src);
            var px1 = Avx.LoadVector256(src + Vector256<float>.Count);
            var px2 = Avx.LoadVector256(src + Vector256<float>.Count * 2);
            var px3 = Avx.LoadVector256(src + Vector256<float>.Count * 3);
            src += Vector256<byte>.Count;

            // Broadcast each pixel's alpha and keep it at or above the floor.
            var al0 = Avx.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
            var al1 = Avx.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
            var al2 = Avx.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
            var al3 = Avx.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);

            al0 = Avx.Max(al0, alphaFloor);
            al1 = Avx.Max(al1, alphaFloor);
            al2 = Avx.Max(al2, alphaFloor);
            al3 = Avx.Max(al3, alphaFloor);

            // Divide by alpha, then put the alpha value back into its lane.
            px0 = Avx.Multiply(px0, Avx.Reciprocal(al0));
            px1 = Avx.Multiply(px1, Avx.Reciprocal(al1));
            px2 = Avx.Multiply(px2, Avx.Reciprocal(al2));
            px3 = Avx.Multiply(px3, Avx.Reciprocal(al3));

            px0 = Avx.Blend(px0, al0, HWIntrinsics.BlendMaskAlpha);
            px1 = Avx.Blend(px1, al1, HWIntrinsics.BlendMaskAlpha);
            px2 = Avx.Blend(px2, al2, HWIntrinsics.BlendMaskAlpha);
            px3 = Avx.Blend(px3, al3, HWIntrinsics.BlendMaskAlpha);

            // Pixels whose alpha was raised to the floor become zero.
            px0 = Avx.BlendVariable(px0, zeros, HWIntrinsics.AvxCompareEqual(al0, alphaFloor));
            px1 = Avx.BlendVariable(px1, zeros, HWIntrinsics.AvxCompareEqual(al1, alphaFloor));
            px2 = Avx.BlendVariable(px2, zeros, HWIntrinsics.AvxCompareEqual(al2, alphaFloor));
            px3 = Avx.BlendVariable(px3, zeros, HWIntrinsics.AvxCompareEqual(al3, alphaFloor));

            px0 = Avx.Multiply(px0, byteScale);
            px1 = Avx.Multiply(px1, byteScale);
            px2 = Avx.Multiply(px2, byteScale);
            px3 = Avx.Multiply(px3, byteScale);

            var int0 = Avx.ConvertToVector256Int32(px0);
            var int1 = Avx.ConvertToVector256Int32(px1);
            var int2 = Avx.ConvertToVector256Int32(px2);
            var int3 = Avx.ConvertToVector256Int32(px3);

            // int32 -> int16 -> byte with saturation; the permute then reorders the 32-bit groups.
            // NOTE(review): presumably this undoes the per-128-bit-lane interleaving of the packs - confirm.
            var short0 = Avx2.PackSignedSaturate(int0, int1);
            var short1 = Avx2.PackSignedSaturate(int2, int3);

            var packed = Avx2.PackUnsignedSaturate(short0, short1);
            packed = Avx2.PermuteVar8x32(packed.AsInt32(), permuteMask).AsByte();

            Avx.Store(dst, packed);
            dst += Vector256<byte>.Count;
        }
    }
    else if (Sse41.IsSupported)
    {
        var zeros = Vector128<float>.Zero;
        var alphaFloor = Vector128.Create(0.5f / byte.MaxValue);
        var byteScale = Vector128.Create((float)byte.MaxValue);

        // Each iteration consumes Vector128<byte>.Count floats and produces as many bytes.
        float* limit = srcEnd - Vector128<byte>.Count;
        while (src <= limit)
        {
            var px0 = Sse.LoadVector128(src);
            var px1 = Sse.LoadVector128(src + Vector128<float>.Count);
            var px2 = Sse.LoadVector128(src + Vector128<float>.Count * 2);
            var px3 = Sse.LoadVector128(src + Vector128<float>.Count * 3);
            src += Vector128<byte>.Count;

            // Broadcast each pixel's alpha and keep it at or above the floor.
            var al0 = Sse.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
            var al1 = Sse.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
            var al2 = Sse.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
            var al3 = Sse.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);

            al0 = Sse.Max(al0, alphaFloor);
            al1 = Sse.Max(al1, alphaFloor);
            al2 = Sse.Max(al2, alphaFloor);
            al3 = Sse.Max(al3, alphaFloor);

            // Divide by alpha, then put the alpha value back into its lane.
            px0 = Sse.Multiply(px0, Sse.Reciprocal(al0));
            px1 = Sse.Multiply(px1, Sse.Reciprocal(al1));
            px2 = Sse.Multiply(px2, Sse.Reciprocal(al2));
            px3 = Sse.Multiply(px3, Sse.Reciprocal(al3));

            px0 = Sse41.Blend(px0, al0, HWIntrinsics.BlendMaskAlpha);
            px1 = Sse41.Blend(px1, al1, HWIntrinsics.BlendMaskAlpha);
            px2 = Sse41.Blend(px2, al2, HWIntrinsics.BlendMaskAlpha);
            px3 = Sse41.Blend(px3, al3, HWIntrinsics.BlendMaskAlpha);

            // Pixels whose alpha was raised to the floor become zero.
            px0 = Sse41.BlendVariable(px0, zeros, Sse.CompareEqual(al0, alphaFloor));
            px1 = Sse41.BlendVariable(px1, zeros, Sse.CompareEqual(al1, alphaFloor));
            px2 = Sse41.BlendVariable(px2, zeros, Sse.CompareEqual(al2, alphaFloor));
            px3 = Sse41.BlendVariable(px3, zeros, Sse.CompareEqual(al3, alphaFloor));

            px0 = Sse.Multiply(px0, byteScale);
            px1 = Sse.Multiply(px1, byteScale);
            px2 = Sse.Multiply(px2, byteScale);
            px3 = Sse.Multiply(px3, byteScale);

            var int0 = Sse2.ConvertToVector128Int32(px0);
            var int1 = Sse2.ConvertToVector128Int32(px1);
            var int2 = Sse2.ConvertToVector128Int32(px2);
            var int3 = Sse2.ConvertToVector128Int32(px3);

            // int32 -> int16 -> byte with saturation.
            var short0 = Sse2.PackSignedSaturate(int0, int1);
            var short1 = Sse2.PackSignedSaturate(int2, int3);

            var packed = Sse2.PackUnsignedSaturate(short0, short1);

            Sse2.Store(dst, packed);
            dst += Vector128<byte>.Count;
        }
    }
#endif

    // Scalar path, one pixel at a time; also finishes what the vector loops left over.
    float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

    while (src < srcEnd)
    {
        float alpha = src[3];
        if (alpha < fmin)
        {
            *(uint*)dst = 0;
        }
        else
        {
            float alphaInv = fmax / alpha;

            // Compute all four bytes before storing any of them.
            byte b0 = ClampToByte((int)(src[0] * alphaInv + fround));
            byte b1 = ClampToByte((int)(src[1] * alphaInv + fround));
            byte b2 = ClampToByte((int)(src[2] * alphaInv + fround));
            byte b3 = ClampToByte((int)(alpha * fmax + fround));

            dst[0] = b0;
            dst[1] = b1;
            dst[2] = b2;
            dst[3] = b3;
        }

        src += 4;
        dst += 4;
    }
}
/// <summary>
/// Sharpens one line of floats: for each element the difference <c>y - b</c> is (optionally
/// thresholded and) multiplied by <paramref name="amt"/> and added to the input value. When
/// <paramref name="gamma"/> is set, the addition happens on the square root of the (non-negative)
/// input and the clamped sum is squared again.
/// </summary>
/// <param name="cstart">Input line; reading starts <c>ox * channels</c> floats in.</param>
/// <param name="ystart">First operand of the difference; reading starts <paramref name="ox"/> floats in.</param>
/// <param name="bstart">Second operand of the difference, read from its start.</param>
/// <param name="ostart">Output line, written from its start.</param>
/// <param name="ox">Offset applied to the input pointers.</param>
/// <param name="ow">Number of elements to process (multiplied by <c>channels</c>).</param>
/// <param name="amt">Multiplier applied to the difference.</param>
/// <param name="thresh">When greater than zero, differences whose magnitude does not exceed it are ignored.</param>
/// <param name="gamma">Selects the square-root / square round trip described above.</param>
unsafe void IConvolver.SharpenLine(byte* cstart, byte* ystart, byte* bstart, byte* ostart, int ox, int ow, float amt, float thresh, bool gamma)
{
    float* src = (float*)cstart + (uint)ox * channels, lumaPtr = (float*)ystart + (uint)ox, blurPtr = (float*)bstart, dst = (float*)ostart;
    float* srcEnd = src + (uint)ow * channels;

    bool threshold = thresh > 0f;

    if (Avx.IsSupported && src <= srcEnd - VectorAvx.Count)
    {
        var threshV = Vector256.Create(threshold ? thresh : -1f);
        var absMask = Vector256.Create(0x7fffffff).AsSingle();
        var amountV = Vector256.Create(amt);
        var zeroV = VectorAvx.Zero;

        // Last position from which a whole vector can still be processed.
        float* limit = srcEnd - VectorAvx.Count;
        do
        {
            var diff = Avx.Subtract(Avx.LoadVector256(lumaPtr), Avx.LoadVector256(blurPtr));
            lumaPtr += VectorAvx.Count;
            blurPtr += VectorAvx.Count;

            if (threshold)
            {
                // Zero the lanes whose absolute difference does not exceed the threshold.
                var keep = HWIntrinsics.AvxCompareGreaterThan(Avx.And(diff, absMask), threshV);
                diff = Avx.And(diff, keep);
            }
            diff = Avx.Multiply(diff, amountV);

            var value = Avx.LoadVector256(src);
            src += VectorAvx.Count;

            if (gamma)
            {
                value = Avx.Max(value, zeroV);
                // value * rsqrt(value) stands in for sqrt(value).
                value = Avx.Multiply(value, Avx.ReciprocalSqrt(value));
                value = Avx.Add(value, diff);
                value = Avx.Max(value, zeroV);
                value = Avx.Multiply(value, value);
            }
            else
            {
                value = Avx.Add(value, diff);
            }

            Avx.Store(dst, value);
            dst += VectorAvx.Count;
        }
        while (src <= limit);
    }
    else if (src <= srcEnd - VectorSse.Count)
    {
        var threshV = Vector128.Create(threshold ? thresh : -1f);
        var absMask = Vector128.Create(0x7fffffff).AsSingle();
        var amountV = Vector128.Create(amt);
        var zeroV = VectorSse.Zero;

        // Last position from which a whole vector can still be processed.
        float* limit = srcEnd - VectorSse.Count;
        do
        {
            var diff = Sse.Subtract(Sse.LoadVector128(lumaPtr), Sse.LoadVector128(blurPtr));
            lumaPtr += VectorSse.Count;
            blurPtr += VectorSse.Count;

            if (threshold)
            {
                // Zero the lanes whose absolute difference does not exceed the threshold.
                var keep = Sse.CompareGreaterThan(Sse.And(diff, absMask), threshV);
                diff = Sse.And(diff, keep);
            }
            diff = Sse.Multiply(diff, amountV);

            var value = Sse.LoadVector128(src);
            src += VectorSse.Count;

            if (gamma)
            {
                value = Sse.Max(value, zeroV);
                // value * rsqrt(value) stands in for sqrt(value).
                value = Sse.Multiply(value, Sse.ReciprocalSqrt(value));
                value = Sse.Add(value, diff);
                value = Sse.Max(value, zeroV);
                value = Sse.Multiply(value, value);
            }
            else
            {
                value = Sse.Add(value, diff);
            }

            Sse.Store(dst, value);
            dst += VectorSse.Count;
        }
        while (src <= limit);
    }

    // Scalar tail for the elements that did not fill a whole vector.
    float fmin = VectorSse.Zero.ToScalar();

    while (src < srcEnd)
    {
        float dif = *lumaPtr - *blurPtr;
        lumaPtr++;
        blurPtr++;

        float c0 = *src;
        src++;

        if (!threshold || Math.Abs(dif) > thresh)
        {
            dif *= amt;

            if (gamma)
            {
                c0 = MathUtil.MaxF(c0, fmin).Sqrt();
                c0 = MathUtil.MaxF(c0 + dif, fmin);
                c0 *= c0;
            }
            else
            {
                c0 += dif;
            }
        }

        *dst = c0;
        dst++;
    }
}
/// <summary>
/// Returns the largest element of <paramref name="matrix"/>.
/// </summary>
/// <param name="matrix">The matrix to scan; it must contain at least one element.</param>
/// <returns>The maximum of all elements.</returns>
/// <exception cref="IndexOutOfRangeException">The matrix has no elements.</exception>
/// <remarks>
/// Complete vectors are reduced with AVX (or SSE) and the remaining elements with a scalar loop.
/// The vector loops use an inclusive bound so that the final complete vector is also handled by
/// the vector path instead of always falling through to the scalar loop.
/// </remarks>
public static unsafe float Max(this Matrix<float> matrix)
{
    var i = 0;

    fixed (float* ptr = matrix.GetArray())
    {
        var span = new Span<float>(ptr, matrix.Length);
        var maxScalar = span[0];

        if (Avx.IsSupported)
        {
            // Seed every lane with the first element.
            var max = Vector256.Create(maxScalar);

            while (i <= span.Length - 8)
            {
                max = Avx.Max(Avx.LoadVector256(ptr + i), max);
                i += 8;
            }

            // NOTE(review): MaxVector256 presumably reduces the 8 lanes to their maximum - confirm.
            maxScalar = max.MaxVector256(8);
        }
        else if (Sse.IsSupported)
        {
            // Seed every lane with the first element.
            var max = Vector128.Create(maxScalar);

            while (i <= span.Length - 4)
            {
                max = Sse.Max(Sse.LoadVector128(ptr + i), max);
                i += 4;
            }

            // NOTE(review): MaxVector128 presumably reduces the 4 lanes to their maximum - confirm.
            maxScalar = max.MaxVector128(4);
        }

        // Scalar tail (or the whole input when no vector path is available).
        while (i < span.Length)
        {
            if (maxScalar < span[i])
            {
                maxScalar = span[i];
            }

            i++;
        }

        return maxScalar;
    }
}