/// <summary>
/// Works out how many extra elements to add to a kernel of <paramref name="ksize"/> taps so that its
/// length becomes a multiple of the processing step chosen for the given channel count.
/// </summary>
/// <param name="isize">Upper bound for the padded kernel length; padding is dropped if it would exceed this.</param>
/// <param name="ksize">Unpadded kernel length.</param>
/// <param name="channels">Channel count; must be non-zero unless it is 3, because it is used as a divisor.</param>
/// <returns>The padding amount, or 0 when <c>T</c> is not <see cref="float"/>, the kernel is below the
/// minimum size for the channel count, or the padded kernel would be longer than <paramref name="isize"/>.</returns>
private static int getPadding(int isize, int ksize, int channels)
{
    // Only the float specialization is ever padded.
    if (typeof(T) != typeof(float))
    {
        return 0;
    }

    // Step size the kernel length is rounded up to.
    int step;
    if (channels == 3)
    {
        step = 4;
    }
    else
    {
        int lanes = ksize >= 8 ? HWIntrinsics.VectorCount<T>() : 4;
        step = lanes / channels;
    }

    // NOTE(review): assumes MathUtil.DivCeiling is a rounding-up integer division — confirm.
    int padding = MathUtil.DivCeiling(ksize, step) * step - ksize;

    // Smallest kernel length for which padding is applied.
    int minKernel;
    if (channels == 4)
    {
        minKernel = 1;
    }
    else if (HWIntrinsics.IsSupported || channels == 1)
    {
        minKernel = 2;
    }
    else
    {
        minKernel = 3;
    }

    if (ksize < minKernel)
    {
        return 0;
    }

    return ksize + padding > isize ? 0 : padding;
}
/// <summary>
/// Converts <paramref name="cb"/> input bytes to floats, dispatching to the AVX2 worker when it is
/// available and the input spans at least one byte vector, and to the scalar worker otherwise.
/// </summary>
/// <param name="ipstart">First input byte.</param>
/// <param name="opstart">Output buffer, written as <see cref="float"/> values.</param>
/// <param name="igtstart">Float table forwarded unchanged to the worker (presumably a byte-to-float lookup table — confirm against the workers).</param>
/// <param name="cb">Number of input bytes.</param>
unsafe private static void convertFloat(byte* ipstart, byte* opstart, float* igtstart, int cb)
{
    byte* src = ipstart;
    byte* srcEnd = ipstart + cb;
    float* dst = (float*)opstart;
    float* table = igtstart;

#if HWINTRINSICS
    if (Avx2.IsSupported && cb >= HWIntrinsics.VectorCount<byte>())
    {
        convertFloatAvx2(src, srcEnd, dst, table);
        return;
    }
#endif

    convertFloatScalar(src, srcEnd, dst, table);
}
/// <summary>
/// SSE2 denoise pass over one line of bytes, modifying <paramref name="pcurr"/> in place.
/// A byte is replaced by a blend of itself and its counterparts in <paramref name="pprev"/> and
/// <paramref name="pnext"/> only when it is within <c>denoiseThreshold</c> of at least one of them and
/// is strictly above both or strictly below both of them.
/// </summary>
/// <param name="pcurr">Line to denoise; read and written.</param>
/// <param name="pprev">Reference line compared against (presumably the same line of the previous frame — confirm with caller).</param>
/// <param name="pnext">Reference line compared against (presumably the same line of the next frame — confirm with caller).</param>
/// <param name="cb">Line length in bytes. NOTE(review): the code assumes at least one full 16-byte vector; smaller values wrap the end offset.</param>
unsafe private static void denoiseLineSse2(byte* pcurr, byte* pprev, byte* pnext, int cb)
{
    nuint step = (nuint)Vector128<byte>.Count;
    nuint pos = 0;
    nuint last = (nuint)cb - step;

    // 0x80 flips the sign bit so unsigned bytes can be ordered with the signed compare.
    var signFlip = Vector128.Create((byte)0x80);
    var limit = Vector128.Create(denoiseThreshold);

    while (true)
    {
        do
        {
            var cur = Sse2.LoadVector128(pcurr + pos);
            var prv = Sse2.LoadVector128(pprev + pos);
            var nxt = Sse2.LoadVector128(pnext + pos);

            // Absolute differences: one of the two saturating subtractions is always zero.
            var distPrev = Sse2.Or(Sse2.SubtractSaturate(cur, prv), Sse2.SubtractSaturate(prv, cur));
            // max(d, limit) == limit  <=>  d <= limit
            var nearPrev = Sse2.CompareEqual(Sse2.Max(distPrev, limit), limit);

            var distNext = Sse2.Or(Sse2.SubtractSaturate(cur, nxt), Sse2.SubtractSaturate(nxt, cur));
            var nearNext = Sse2.CompareEqual(Sse2.Max(distNext, limit), limit);

            var avgPrev = Sse2.Average(cur, prv);
            var avgNext = Sse2.Average(cur, nxt);

            // NOTE(review): assumes HWIntrinsics.BlendVariable(a, b, m) selects b where m is set — confirm.
            var towardPrev = HWIntrinsics.BlendVariable(avgNext, avgPrev, nearPrev);
            var towardNext = HWIntrinsics.BlendVariable(avgPrev, avgNext, nearNext);
            var smoothed = Sse2.Average(towardPrev, towardNext);

            var nearEither = Sse2.Or(nearPrev, nearNext);

            // Where both references are close, mix in the average of the two references as well.
            var avgOuter = Sse2.Average(prv, nxt);
            var nearBoth = Sse2.And(nearPrev, nearNext);
            smoothed = Sse2.Average(smoothed, HWIntrinsics.BlendVariable(smoothed, avgOuter, nearBoth));

            var curS = Sse2.Xor(cur, signFlip).AsSByte();
            var prvS = Sse2.Xor(prv, signFlip).AsSByte();
            var nxtS = Sse2.Xor(nxt, signFlip).AsSByte();

            // Current byte strictly above both references / strictly below both references.
            var above = Sse2.And(Sse2.CompareGreaterThan(curS, prvS), Sse2.CompareGreaterThan(curS, nxtS));
            var below = Sse2.And(Sse2.CompareGreaterThan(prvS, curS), Sse2.CompareGreaterThan(nxtS, curS));

            var apply = Sse2.And(nearEither, Sse2.Or(above, below).AsByte());
            var result = HWIntrinsics.BlendVariable(cur, smoothed, apply);

            Sse2.Store(pcurr + pos, result);
            pos += step;
        }
        while (pos <= last);

        // Finished once the whole line has been covered; otherwise run once more over the
        // final (overlapping) vector that ends exactly at the end of the line.
        if (pos >= last + step)
        {
            break;
        }

        pos = last;
    }
}
/// <summary>
/// SSE2 de-duplication pass over one line of 32-bit pixels. Each pixel of <paramref name="pcurr"/> is
/// compared with the pixel at the same position in <paramref name="pprev"/> (or with <paramref name="bg"/>
/// when <paramref name="pprev"/> is null); the line is written to <paramref name="penc"/> with matching
/// pixels replaced by a fill value.
/// </summary>
/// <param name="pcurr">Current line.</param>
/// <param name="pprev">Line to compare against, or null to compare every pixel against <paramref name="bg"/>.</param>
/// <param name="penc">Output line.</param>
/// <param name="cb">Line length in bytes. NOTE(review): the code assumes at least one full 16-byte vector; smaller values wrap the end offset.</param>
/// <param name="bg">Pixel value used both as comparison value and as fill value when <paramref name="pprev"/> is null.</param>
/// <returns>
/// <c>eql</c>: number of consecutive matching pixels at the start of the line;
/// <c>eqr</c>: number of consecutive matching pixels at the end of the line.
/// </returns>
unsafe private static (uint eql, uint eqr) dedupeLineSse2(byte* pcurr, byte* pprev, byte* penc, int cb, uint bg)
{
    byte* ip = pcurr, pp = pprev, op = penc;
    nuint cnt = 0, end = (nuint)cb - (nuint)Vector128<byte>.Count;

    bool lfound = false;
    uint eql = 0u, eqr = 0u;

    // Fill value for matching pixels: bg when there is no previous line, zero otherwise.
    var vbg = pp == (byte*)0 ? Vector128.Create(bg) : Vector128<uint>.Zero;

    LoopTop:
    do
    {
        var vprev = pp != (byte*)0 ? Sse2.LoadVector128(pp + cnt).AsUInt32() : vbg;
        var vcurr = Sse2.LoadVector128(ip + cnt).AsUInt32();

        var veq = Sse2.CompareEqual(vcurr, vprev);
        // NOTE(review): assumes HWIntrinsics.BlendVariable(a, b, m) selects b where m is set — confirm.
        vcurr = HWIntrinsics.BlendVariable(vcurr, vbg, veq);

        Sse2.Store(op + cnt, vcurr.AsByte());
        cnt += (nuint)Vector128<byte>.Count;

        // One bit per byte, so only the low 16 bits can be set.
        uint msk = (uint)Sse2.MoveMask(veq.AsByte());
        if (msk == ushort.MinValue)
        {
            // No pixel in this vector matched.
            lfound = true;
            eqr = 0u;
        }
        else if (msk == ushort.MaxValue)
        {
            // Every pixel in this vector matched.
            if (!lfound)
            {
                eql += (uint)Vector128<uint>.Count;
            }

            eqr += (uint)Vector128<uint>.Count;
        }
        else
        {
            // Mixed vector: after inversion, set bits in the low 16 mark non-matching bytes.
            msk = ~msk;
            if (!lfound)
            {
                eql += (uint)BitOperations.TrailingZeroCount(msk) / sizeof(uint);
                lfound = true;
            }

            // The inversion also sets the 16 unused upper bits, so a leading-zero count on msk
            // itself would always be 0. Shift the 16 meaningful bits to the top before counting
            // so the matching pixels at the high end of the vector are actually measured.
            eqr = (uint)BitOperations.LeadingZeroCount(msk << 16) / sizeof(uint);
        }
    }
    while (cnt <= end);

    if (cnt < end + (nuint)Vector128<byte>.Count)
    {
        // Re-run once over the final vector, which overlaps pixels already processed;
        // remove the overlap from the running counts before they are updated again.
        uint offs = (uint)(cnt - end) / sizeof(uint);
        if (!lfound)
        {
            eql -= offs;
        }

        eqr -= offs;

        cnt = end;
        goto LoopTop;
    }

    return (eql, eqr);
}
/// <summary>
/// In-place lookup-table conversion of a float buffer laid out in groups of four, where the fourth
/// value of each group divides the first three before the lookup and multiplies them afterwards.
/// Lookups interpolate between adjacent table entries.
/// </summary>
/// <param name="ipstart">Buffer to convert; read and written as <see cref="float"/>.</param>
/// <param name="opstart">Must equal <paramref name="ipstart"/> (asserted in debug builds); not otherwise used.</param>
/// <param name="lutstart">Lookup table. Entries up to index <paramref name="lutmax"/> + 1 are read.</param>
/// <param name="lutmax">Scale applied before the lookup and upper clamp for the scaled value.</param>
/// <param name="cb">Buffer length in bytes.</param>
unsafe public static void ConvertFloat3A(byte* ipstart, byte* opstart, float* lutstart, int lutmax, int cb)
{
    Debug.Assert(ipstart == opstart);

    float* ip = (float*)ipstart;
    float* ipe = (float*)(ipstart + cb);
    float* lut = lutstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        // NOTE(review): GatherMask3x presumably enables the gather for the first three lanes of each
        // group only, leaving the fourth lane at the 1.0 source value — confirm in HWIntrinsics.
        var gatherMask = Avx.BroadcastVector128ToVector256((float*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.GatherMask3x)));
        var scaleMax = Vector256.Create((float)lutmax);
        var zero = Vector256<float>.Zero;
        var oneF = Vector256.Create(1f);
        var oneI = Vector256.Create(1);

        float* simdEnd = ipe - Vector256<float>.Count;
        while (ip <= simdEnd)
        {
            var val = Avx.Max(zero, Avx.LoadVector256(ip));
            var alpha = Avx.Shuffle(val, val, HWIntrinsics.ShuffleMaskAlpha);

            // Divide by alpha (approximate reciprocal) and scale to table range in one multiply.
            var factor = Avx.Multiply(scaleMax, Avx.Reciprocal(alpha));
            val = Avx.Multiply(val, factor);
            val = Avx.Min(val, scaleMax);

            var idx = Avx.ConvertToVector256Int32WithTruncation(val);
            var idxF = Avx.ConvertToVector256Single(idx);

            var lower = Avx2.GatherMaskVector256(oneF, lut, idx, gatherMask, sizeof(float));
            var upper = Avx2.GatherMaskVector256(oneF, lut, Avx2.Add(idx, oneI), gatherMask, sizeof(float));

            var frac = Avx.Subtract(val, idxF);
            val = HWIntrinsics.Lerp(lower, upper, frac);
            val = Avx.Multiply(val, alpha);

            Avx.Store(ip, val);
            ip += Vector256<float>.Count;
        }
    }
#endif

    {
        var scaleMax = new Vector4(lutmax);
        var zero = Vector4.Zero;
        float alphaMin = new Vector4(1 / 1024f).X;

        for (; ip < ipe; ip += 4)
        {
            var val = Unsafe.ReadUnaligned<Vector4>(ip);
            float alpha = val.W;

            // Groups whose fourth value is below the cutoff are cleared entirely.
            if (alpha < alphaMin)
            {
                Unsafe.WriteUnaligned(ip, zero);
                continue;
            }

            val = (val * scaleMax / alpha).Clamp(zero, scaleMax);

            float x = val.X;
            float y = val.Y;
            float z = val.Z;

            uint ix = (uint)x;
            uint iy = (uint)y;
            uint iz = (uint)z;

            // The fourth value itself is left untouched.
            ip[0] = Lerp(lut[ix], lut[ix + 1], x - (int)ix) * alpha;
            ip[1] = Lerp(lut[iy], lut[iy + 1], y - (int)iy) * alpha;
            ip[2] = Lerp(lut[iz], lut[iz + 1], z - (int)iz) * alpha;
        }
    }
}
/// <summary>
/// In-place lookup-table conversion of a float buffer: each value is scaled by
/// <paramref name="lutmax"/>, clamped to [0, <paramref name="lutmax"/>], and replaced by an
/// interpolation between the two adjacent table entries.
/// </summary>
/// <param name="ipstart">Buffer to convert; read and written as <see cref="float"/>.</param>
/// <param name="opstart">Must equal <paramref name="ipstart"/> (asserted in debug builds); not otherwise used.</param>
/// <param name="lutstart">Lookup table. Entries up to index <paramref name="lutmax"/> + 1 are read.</param>
/// <param name="lutmax">Scale applied before the lookup and upper clamp for the scaled value.</param>
/// <param name="cb">Buffer length in bytes.</param>
unsafe public static void ConvertFloat(byte* ipstart, byte* opstart, float* lutstart, int lutmax, int cb)
{
    Debug.Assert(ipstart == opstart);

    float* ip = (float*)ipstart;
    float* ipe = (float*)(ipstart + cb);
    float* lut = lutstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var scaleMax = Vector256.Create((float)lutmax);
        var zero = Vector256<float>.Zero;
        var oneI = Vector256.Create(1);

        float* simdEnd = ipe - Vector256<float>.Count;
        while (ip <= simdEnd)
        {
            var val = Avx.Multiply(scaleMax, Avx.LoadVector256(ip));
            val = Avx.Min(Avx.Max(zero, val), scaleMax);

            var idx = Avx.ConvertToVector256Int32WithTruncation(val);
            var idxF = Avx.ConvertToVector256Single(idx);

            var lower = Avx2.GatherVector256(lut, idx, sizeof(float));
            var upper = Avx2.GatherVector256(lut, Avx2.Add(idx, oneI), sizeof(float));

            val = HWIntrinsics.Lerp(lower, upper, Avx.Subtract(val, idxF));

            Avx.Store(ip, val);
            ip += Vector256<float>.Count;
        }

        // Remaining values that do not fill a whole vector.
        float lo = zero.ToScalar(), hi = scaleMax.ToScalar();
        while (ip < ipe)
        {
            float scaled = (*ip * hi).Clamp(lo, hi);
            uint slot = (uint)scaled;

            *ip = Lerp(lut[slot], lut[slot + 1], scaled - slot);
            ip++;
        }
    }
    else
#endif
    {
        var scaleMax = new Vector4(lutmax);
        var zero = Vector4.Zero;

        // Four values per iteration.
        float* quadEnd = ipe - 4;
        while (ip <= quadEnd)
        {
            var val = (Unsafe.ReadUnaligned<Vector4>(ip) * scaleMax).Clamp(zero, scaleMax);

            float x = val.X;
            float y = val.Y;
            float z = val.Z;
            float w = val.W;

            uint ix = (uint)x;
            uint iy = (uint)y;
            uint iz = (uint)z;
            uint iw = (uint)w;

            ip[0] = Lerp(lut[ix], lut[ix + 1], x - (int)ix);
            ip[1] = Lerp(lut[iy], lut[iy + 1], y - (int)iy);
            ip[2] = Lerp(lut[iz], lut[iz + 1], z - (int)iz);
            ip[3] = Lerp(lut[iw], lut[iw + 1], w - (int)iw);

            ip += 4;
        }

        // Remaining values that do not fill a whole group of four.
        float lo = zero.X, hi = scaleMax.X;
        while (ip < ipe)
        {
            float scaled = (*ip * hi).Clamp(lo, hi);
            uint slot = (uint)scaled;

            *ip = Lerp(lut[slot], lut[slot + 1], scaled - slot);
            ip++;
        }
    }
}
/// <summary>
/// Converts a line of float pixels (four floats each, the fourth being alpha) to four bytes per pixel.
/// The first three values are divided by alpha, everything is scaled by 255 and saturated to byte range.
/// Pixels whose alpha does not exceed 0.5/255 are written as all-zero.
/// </summary>
/// <param name="ipstart">Input line, read as <see cref="float"/>.</param>
/// <param name="opstart">Output line, one byte per input float.</param>
/// <param name="cb">Input length in bytes.</param>
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    float* ip = (float*)ipstart;
    float* ipe = (float*)(ipstart + cb);
    byte* op = opstart;

#if HWINTRINSICS
    if (Avx2.IsSupported)
    {
        var zero = Vector256<float>.Zero;
        var alphaFloor = Vector256.Create(0.5f / byte.MaxValue);
        var scale = Vector256.Create((float)byte.MaxValue);
        // The 256-bit packs work per 128-bit half; this permute puts the packed bytes back in order.
        var unscramble = Avx.LoadVector256((int*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

        // Each iteration consumes as many floats as there are bytes in one output vector.
        float* simdEnd = ipe - Vector256<byte>.Count;
        while (ip <= simdEnd)
        {
            // All loads happen before the store below.
            var px0 = Avx.LoadVector256(ip);
            var px1 = Avx.LoadVector256(ip + Vector256<float>.Count);
            var px2 = Avx.LoadVector256(ip + Vector256<float>.Count * 2);
            var px3 = Avx.LoadVector256(ip + Vector256<float>.Count * 3);
            ip += Vector256<byte>.Count;

            // Broadcast each pixel's alpha across its four lanes.
            var al0 = Avx.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
            var al1 = Avx.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
            var al2 = Avx.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
            var al3 = Avx.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);

            // Floor alpha so the reciprocal stays finite; floored lanes are zeroed further down.
            al0 = Avx.Max(al0, alphaFloor);
            al1 = Avx.Max(al1, alphaFloor);
            al2 = Avx.Max(al2, alphaFloor);
            al3 = Avx.Max(al3, alphaFloor);

            // Divide by alpha using the approximate reciprocal.
            px0 = Avx.Multiply(px0, Avx.Reciprocal(al0));
            px1 = Avx.Multiply(px1, Avx.Reciprocal(al1));
            px2 = Avx.Multiply(px2, Avx.Reciprocal(al2));
            px3 = Avx.Multiply(px3, Avx.Reciprocal(al3));

            // Put the alpha value itself back into the alpha lane.
            px0 = Avx.Blend(px0, al0, HWIntrinsics.BlendMaskAlpha);
            px1 = Avx.Blend(px1, al1, HWIntrinsics.BlendMaskAlpha);
            px2 = Avx.Blend(px2, al2, HWIntrinsics.BlendMaskAlpha);
            px3 = Avx.Blend(px3, al3, HWIntrinsics.BlendMaskAlpha);

            // Zero every lane of pixels whose alpha was raised to the floor.
            px0 = Avx.BlendVariable(px0, zero, HWIntrinsics.AvxCompareEqual(al0, alphaFloor));
            px1 = Avx.BlendVariable(px1, zero, HWIntrinsics.AvxCompareEqual(al1, alphaFloor));
            px2 = Avx.BlendVariable(px2, zero, HWIntrinsics.AvxCompareEqual(al2, alphaFloor));
            px3 = Avx.BlendVariable(px3, zero, HWIntrinsics.AvxCompareEqual(al3, alphaFloor));

            px0 = Avx.Multiply(px0, scale);
            px1 = Avx.Multiply(px1, scale);
            px2 = Avx.Multiply(px2, scale);
            px3 = Avx.Multiply(px3, scale);

            var int0 = Avx.ConvertToVector256Int32(px0);
            var int1 = Avx.ConvertToVector256Int32(px1);
            var int2 = Avx.ConvertToVector256Int32(px2);
            var int3 = Avx.ConvertToVector256Int32(px3);

            // int32 -> int16 -> uint8 with saturation.
            var words01 = Avx2.PackSignedSaturate(int0, int1);
            var words23 = Avx2.PackSignedSaturate(int2, int3);

            var packed = Avx2.PackUnsignedSaturate(words01, words23);
            packed = Avx2.PermuteVar8x32(packed.AsInt32(), unscramble).AsByte();

            Avx.Store(op, packed);
            op += Vector256<byte>.Count;
        }
    }
    else if (Sse41.IsSupported)
    {
        var zero = Vector128<float>.Zero;
        var alphaFloor = Vector128.Create(0.5f / byte.MaxValue);
        var scale = Vector128.Create((float)byte.MaxValue);

        // Each iteration consumes as many floats as there are bytes in one output vector.
        float* simdEnd = ipe - Vector128<byte>.Count;
        while (ip <= simdEnd)
        {
            // All loads happen before the store below.
            var px0 = Sse.LoadVector128(ip);
            var px1 = Sse.LoadVector128(ip + Vector128<float>.Count);
            var px2 = Sse.LoadVector128(ip + Vector128<float>.Count * 2);
            var px3 = Sse.LoadVector128(ip + Vector128<float>.Count * 3);
            ip += Vector128<byte>.Count;

            // Broadcast each pixel's alpha across its four lanes.
            var al0 = Sse.Shuffle(px0, px0, HWIntrinsics.ShuffleMaskAlpha);
            var al1 = Sse.Shuffle(px1, px1, HWIntrinsics.ShuffleMaskAlpha);
            var al2 = Sse.Shuffle(px2, px2, HWIntrinsics.ShuffleMaskAlpha);
            var al3 = Sse.Shuffle(px3, px3, HWIntrinsics.ShuffleMaskAlpha);

            // Floor alpha so the reciprocal stays finite; floored lanes are zeroed further down.
            al0 = Sse.Max(al0, alphaFloor);
            al1 = Sse.Max(al1, alphaFloor);
            al2 = Sse.Max(al2, alphaFloor);
            al3 = Sse.Max(al3, alphaFloor);

            // Divide by alpha using the approximate reciprocal.
            px0 = Sse.Multiply(px0, Sse.Reciprocal(al0));
            px1 = Sse.Multiply(px1, Sse.Reciprocal(al1));
            px2 = Sse.Multiply(px2, Sse.Reciprocal(al2));
            px3 = Sse.Multiply(px3, Sse.Reciprocal(al3));

            // Put the alpha value itself back into the alpha lane.
            px0 = Sse41.Blend(px0, al0, HWIntrinsics.BlendMaskAlpha);
            px1 = Sse41.Blend(px1, al1, HWIntrinsics.BlendMaskAlpha);
            px2 = Sse41.Blend(px2, al2, HWIntrinsics.BlendMaskAlpha);
            px3 = Sse41.Blend(px3, al3, HWIntrinsics.BlendMaskAlpha);

            // Zero every lane of pixels whose alpha was raised to the floor.
            px0 = Sse41.BlendVariable(px0, zero, Sse.CompareEqual(al0, alphaFloor));
            px1 = Sse41.BlendVariable(px1, zero, Sse.CompareEqual(al1, alphaFloor));
            px2 = Sse41.BlendVariable(px2, zero, Sse.CompareEqual(al2, alphaFloor));
            px3 = Sse41.BlendVariable(px3, zero, Sse.CompareEqual(al3, alphaFloor));

            px0 = Sse.Multiply(px0, scale);
            px1 = Sse.Multiply(px1, scale);
            px2 = Sse.Multiply(px2, scale);
            px3 = Sse.Multiply(px3, scale);

            var int0 = Sse2.ConvertToVector128Int32(px0);
            var int1 = Sse2.ConvertToVector128Int32(px1);
            var int2 = Sse2.ConvertToVector128Int32(px2);
            var int3 = Sse2.ConvertToVector128Int32(px3);

            // int32 -> int16 -> uint8 with saturation.
            var words01 = Sse2.PackSignedSaturate(int0, int1);
            var words23 = Sse2.PackSignedSaturate(int2, int3);

            var packed = Sse2.PackUnsignedSaturate(words01, words23);

            Sse2.Store(op, packed);
            op += Vector128<byte>.Count;
        }
    }
#endif

    // Scalar path: handles whatever the vector loops left over, or the whole line without them.
    float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

    for (; ip < ipe; ip += 4, op += 4)
    {
        float alpha = ip[3];
        if (alpha < fmin)
        {
            *(uint*)op = 0;
            continue;
        }

        float invAlpha = fmax / alpha;

        // Compute all four bytes before writing any of them.
        byte b0 = ClampToByte((int)(ip[0] * invAlpha + fround));
        byte b1 = ClampToByte((int)(ip[1] * invAlpha + fround));
        byte b2 = ClampToByte((int)(ip[2] * invAlpha + fround));
        byte b3 = ClampToByte((int)(alpha * fmax + fround));

        op[0] = b0;
        op[1] = b1;
        op[2] = b2;
        op[3] = b3;
    }
}