コード例 #1
0
ファイル: KernelMap.cs プロジェクト: sandboxorg/PhotoSauce
        /// <summary>
        /// Computes how many extra taps are needed to round a kernel of <paramref name="ksize"/> samples
        /// up to a whole multiple of the per-iteration step size.
        /// </summary>
        /// <param name="isize">Input size; the padded kernel must not exceed it.</param>
        /// <param name="ksize">Unpadded kernel size.</param>
        /// <param name="channels">Channel count; selects the step size and the minimum kernel size that gets padded.</param>
        /// <returns>The number of padding taps, or 0 when no padding is applied.</returns>
        private static int getPadding(int isize, int ksize, int channels)
        {
            // Padding is only ever produced when the weight type T is float.
            if (typeof(T) != typeof(float))
            {
                return 0;
            }

            // Step size: fixed at 4 for 3-channel data; otherwise the lane count
            // (full vector width for kernels of 8+ taps, else 4) divided by the channel count.
            int step;
            if (channels == 3)
            {
                step = 4;
            }
            else
            {
                int lanes = ksize >= 8 ? HWIntrinsics.VectorCount<T>() : 4;
                step = lanes / channels;
            }

            int extra = MathUtil.DivCeiling(ksize, step) * step - ksize;

            // Smallest kernel size for which padding is applied.
            int minSize;
            if (channels == 4)
            {
                minSize = 1;
            }
            else if (HWIntrinsics.IsSupported || channels == 1)
            {
                minSize = 2;
            }
            else
            {
                minSize = 3;
            }

            bool tooSmall = ksize < minSize;
            if (tooSmall || ksize + extra > isize)
            {
                return 0;
            }

            return extra;
        }
コード例 #2
0
            /// <summary>
            /// Converts <paramref name="cb"/> input bytes to floats, picking the AVX2 implementation
            /// when it is available and the input is at least one byte vector long, else the scalar one.
            /// </summary>
            /// <param name="ipstart">Start of the byte input.</param>
            /// <param name="opstart">Start of the output buffer, written as floats.</param>
            /// <param name="igtstart">Lookup table passed through to the implementation.</param>
            /// <param name="cb">Number of input bytes.</param>
            unsafe private static void convertFloat(byte *ipstart, byte *opstart, float *igtstart, int cb)
            {
                byte *inputEnd = ipstart + cb;
                float *output = (float *)opstart;

#if HWINTRINSICS
                bool useAvx2 = Avx2.IsSupported && cb >= HWIntrinsics.VectorCount<byte>();
                if (useAvx2)
                {
                    convertFloatAvx2(ipstart, inputEnd, output, igtstart);
                    return;
                }
#endif

                convertFloatScalar(ipstart, inputEnd, output, igtstart);
            }
コード例 #3
0
ファイル: TemporalFilters.cs プロジェクト: wk-j/PhotoSauce
        /// <summary>
        /// SSE2 temporal denoise of one line of bytes, performed in place on <paramref name="pcurr"/>.
        /// Each byte of the current line is compared with the byte at the same offset in the previous and
        /// next lines; bytes that are a strict local extremum and lie within <c>denoiseThreshold</c> of at
        /// least one neighbour are replaced by a blend of averages, all others are kept.
        /// </summary>
        /// <param name="pcurr">Current line; read and overwritten.</param>
        /// <param name="pprev">Previous line; read only.</param>
        /// <param name="pnext">Next line; read only.</param>
        /// <param name="cb">Line length in bytes.</param>
        unsafe private static void denoiseLineSse2(byte *pcurr, byte *pprev, byte *pnext, int cb)
        {
            byte *ip = pcurr, pp = pprev, np = pnext;
            // 'end' is the offset of the last full vector. The subtraction is unsigned, so it wraps if cb is
            // smaller than one vector. NOTE(review): callers presumably guarantee cb >= Vector128<byte>.Count — confirm.
            nuint cnt = 0, end = (nuint)cb - (nuint)Vector128 <byte> .Count;

            // 0x80 bias flips the sign bit so unsigned bytes can be ordered with signed compares below.
            var voffset = Vector128.Create((byte)0x80);
            var vthresh = Vector128.Create(denoiseThreshold);

LoopTop:
            do
            {
                var vcurr = Sse2.LoadVector128(ip + cnt);
                var vprev = Sse2.LoadVector128(pp + cnt);
                var vnext = Sse2.LoadVector128(np + cnt);

                // |curr - prev|: one of the two saturating differences is always zero, so OR yields the absolute difference.
                var vdiffp = Sse2.Or(Sse2.SubtractSaturate(vcurr, vprev), Sse2.SubtractSaturate(vprev, vcurr));
                // max(diff, thresh) == thresh  <=>  diff <= thresh
                var vmaskp = Sse2.CompareEqual(Sse2.Max(vdiffp, vthresh), vthresh);

                // Same test against the next line.
                var vdiffn = Sse2.Or(Sse2.SubtractSaturate(vcurr, vnext), Sse2.SubtractSaturate(vnext, vcurr));
                var vmaskn = Sse2.CompareEqual(Sse2.Max(vdiffn, vthresh), vthresh);

                // Averages of the current byte with each neighbour.
                var vavgp = Sse2.Average(vcurr, vprev);
                var vavgn = Sse2.Average(vcurr, vnext);

                // Candidate output built from the two averages, steered by the threshold masks.
                // NOTE(review): HWIntrinsics.BlendVariable is defined elsewhere; presumably it picks the second
                // operand where the mask is set — confirm before relying on the exact blend described here.
                var voutval = Sse2.Average(HWIntrinsics.BlendVariable(vavgn, vavgp, vmaskp), HWIntrinsics.BlendVariable(vavgp, vavgn, vmaskn));
                var voutmsk = Sse2.Or(vmaskp, vmaskn);
                voutval = Sse2.Average(voutval, HWIntrinsics.BlendVariable(voutval, Sse2.Average(vprev, vnext), Sse2.And(vmaskp, vmaskn)));

                // Bias to signed range; SSE2 only provides signed byte greater-than.
                var vcurrs = Sse2.Xor(vcurr, voffset).AsSByte();
                var vprevs = Sse2.Xor(vprev, voffset).AsSByte();
                var vnexts = Sse2.Xor(vnext, voffset).AsSByte();

                // vsurlt: curr is strictly greater than both neighbours; vsurgt: curr is strictly less than both.
                var vsurlt = Sse2.And(Sse2.CompareGreaterThan(vcurrs, vprevs), Sse2.CompareGreaterThan(vcurrs, vnexts));
                var vsurgt = Sse2.And(Sse2.CompareGreaterThan(vprevs, vcurrs), Sse2.CompareGreaterThan(vnexts, vcurrs));

                // Only replace bytes that are a strict extremum AND within threshold of at least one neighbour.
                voutmsk = Sse2.And(voutmsk, Sse2.Or(vsurlt, vsurgt).AsByte());
                voutval = HWIntrinsics.BlendVariable(vcurr, voutval, voutmsk);

                Sse2.Store(ip + cnt, voutval);
                cnt += (nuint)Vector128 <byte> .Count;
            } while (cnt <= end);

            // Tail: if the last vector did not end exactly at cb, rewind so one final vector covers the last
            // 16 bytes. After that pass cnt == end + 16, so this condition is false and the method returns.
            // NOTE(review): the final vector overlaps bytes already written above, so those bytes are filtered
            // a second time using their already-denoised values — confirm this is acceptable.
            if (cnt < end + (nuint)Vector128 <byte> .Count)
            {
                cnt = end;
                goto LoopTop;
            }
        }
コード例 #4
0
ファイル: TemporalFilters.cs プロジェクト: wk-j/PhotoSauce
        /// <summary>
        /// SSE2 de-duplication of one line of 32-bit elements. Each element of <paramref name="pcurr"/> is
        /// compared with the element at the same offset in <paramref name="pprev"/> (or with
        /// <paramref name="bg"/> when <paramref name="pprev"/> is null); the blended result is written to
        /// <paramref name="penc"/>, and the lengths of the equal runs at both ends of the line are returned.
        /// </summary>
        /// <param name="pcurr">Current line; read only.</param>
        /// <param name="pprev">Previous line, or null to compare against <paramref name="bg"/>.</param>
        /// <param name="penc">Output line.</param>
        /// <param name="cb">Line length in bytes.</param>
        /// <param name="bg">Background value used for comparison and replacement when there is no previous line.</param>
        /// <returns>
        /// <c>eql</c>: number of consecutive equal 32-bit elements at the start of the line;
        /// <c>eqr</c>: number of consecutive equal 32-bit elements at the end of the line.
        /// </returns>
        unsafe private static (uint eql, uint eqr) dedupeLineSse2(byte *pcurr, byte *pprev, byte *penc, int cb, uint bg)
        {
            byte *ip = pcurr, pp = pprev, op = penc;
            // 'end' is the offset of the last full vector; the unsigned subtraction wraps if cb is smaller than one vector.
            // NOTE(review): callers presumably guarantee cb >= Vector128<byte>.Count — confirm.
            nuint cnt = 0, end = (nuint)cb - (nuint)Vector128<byte>.Count;

            bool lfound = false;
            uint eql = 0u, eqr = 0u;
            // Replacement value for equal elements: bg when there is no previous line, zero otherwise.
            var vbg = pp == (byte *)0 ? Vector128.Create(bg) : Vector128<uint>.Zero;

LoopTop:
            do
            {
                var vprev = pp != (byte *)0 ? Sse2.LoadVector128(pp + cnt).AsUInt32() : vbg;
                var vcurr = Sse2.LoadVector128(ip + cnt).AsUInt32();

                var veq = Sse2.CompareEqual(vcurr, vprev);
                // NOTE(review): HWIntrinsics.BlendVariable is defined elsewhere; presumably it selects vbg where veq is set — confirm.
                vcurr = HWIntrinsics.BlendVariable(vcurr, vbg, veq);

                Sse2.Store(op + cnt, vcurr.AsByte());
                cnt += (nuint)Vector128<byte>.Count;

                // One bit per byte: only the low 16 bits are meaningful, 4 bits per 32-bit element.
                uint msk = (uint)Sse2.MoveMask(veq.AsByte());
                if (msk == ushort.MinValue)
                {
                    // No element equal: the left run has ended and the right run restarts.
                    lfound = true;
                    eqr    = 0u;
                }
                else if (msk == ushort.MaxValue)
                {
                    // Every element equal: extend the left run (if still open) and the right run.
                    if (!lfound)
                    {
                        eql += (uint)Vector128<uint>.Count;
                    }

                    eqr += (uint)Vector128<uint>.Count;
                }
                else
                {
                    // Mixed vector: invert so set bits mark unequal bytes, keeping only the 16 valid bits.
                    msk = ~msk & ushort.MaxValue;
                    if (!lfound)
                    {
                        eql   += (uint)BitOperations.TrailingZeroCount(msk) / sizeof(uint);
                        lfound = true;
                    }

                    // Shift the 16 valid bits into the top half before counting leading zeros. Counting on
                    // the plain 32-bit complement always gave 0, because the unused upper 16 bits were set.
                    eqr = (uint)BitOperations.LeadingZeroCount(msk << 16) / sizeof(uint);
                }
            } while (cnt <= end);

            // Tail: if the last vector did not end exactly at cb, process one final vector covering the last
            // 16 bytes. The elements it overlaps were already counted, so subtract them before re-counting.
            // The uint subtraction may wrap; that is harmless because the overlapping vector then contains an
            // unequal element and eqr is reassigned, or is all-equal and the wrapped value is corrected by += 4.
            if (cnt < end + (nuint)Vector128<byte>.Count)
            {
                uint offs = (uint)(cnt - end) / sizeof(uint);
                if (!lfound)
                {
                    eql -= offs;
                }
                eqr -= offs;
                cnt  = end;
                goto LoopTop;
            }

            return(eql, eqr);
        }
コード例 #5
0
            /// <summary>
            /// In-place conversion of 4-float pixels through an interpolated lookup table. The first three
            /// components of each pixel are divided by the fourth (W), scaled to the table range, looked up
            /// with linear interpolation between adjacent entries, and multiplied by W again.
            /// </summary>
            /// <param name="ipstart">Start of the float pixel data; read and overwritten.</param>
            /// <param name="opstart">Must equal <paramref name="ipstart"/> (asserted in debug builds).</param>
            /// <param name="lutstart">Lookup table. Entries at index and index + 1 are read, with index up to <paramref name="lutmax"/>.</param>
            /// <param name="lutmax">Scale factor and upper clamp applied to the table position.</param>
            /// <param name="cb">Length of the data in bytes.</param>
            unsafe public static void ConvertFloat3A(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb)
            {
                Debug.Assert(ipstart == opstart);

                float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb);
                float *lp = lutstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    // Gather mask, broadcast to both 128-bit halves.
                    // NOTE(review): HWIntrinsics.GatherMask3x is defined elsewhere; presumably it enables the
                    // three colour lanes of each pixel and disables the alpha lane — confirm.
                    var vgmsk = Avx.BroadcastVector128ToVector256((float *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.GatherMask3x)));
                    var vgmax = Vector256.Create((float)lutmax);
                    var vzero = Vector256 <float> .Zero;
                    var vfone = Vector256.Create(1f);
                    var vione = Vector256.Create(1);

                    // Pull the end pointer back by one vector so 'ip <= ipe' means a full vector remains.
                    ipe -= Vector256 <float> .Count;
                    while (ip <= ipe)
                    {
                        // Clamp negatives to zero.
                        var vf = Avx.Max(vzero, Avx.LoadVector256(ip));
                        // NOTE(review): ShuffleMaskAlpha is defined elsewhere; presumably replicates each pixel's W across its lanes — confirm.
                        var va = Avx.Shuffle(vf, vf, HWIntrinsics.ShuffleMaskAlpha);

                        // value * (lutmax / alpha) using the approximate reciprocal, then clamp to the table range.
                        vf = Avx.Multiply(vf, Avx.Multiply(vgmax, Avx.Reciprocal(va)));
                        vf = Avx.Min(vf, vgmax);

                        // Integer table index and its float form (used below to get the fractional part).
                        var vi  = Avx.ConvertToVector256Int32WithTruncation(vf);
                        var vfi = Avx.ConvertToVector256Single(vi);

                        // Gather table[index] and table[index + 1]; lanes disabled by the mask receive 1.0 from vfone.
                        var vl = Avx2.GatherMaskVector256(vfone, lp, vi, vgmsk, sizeof(float));
                        var vh = Avx2.GatherMaskVector256(vfone, lp, Avx2.Add(vi, vione), vgmsk, sizeof(float));

                        // Interpolate by the fractional position, then multiply by alpha again.
                        vf = HWIntrinsics.Lerp(vl, vh, Avx.Subtract(vf, vfi));
                        vf = Avx.Multiply(vf, va);

                        Avx.Store(ip, vf);
                        ip += Vector256 <float> .Count;
                    }
                    // Restore the true end pointer for the scalar loop below.
                    ipe += Vector256 <float> .Count;
                }
#endif
                // No 'else': this loop handles whatever the AVX2 loop left over, or the whole buffer
                // when AVX2 is unavailable or HWINTRINSICS is not defined.
                {
                    var   vlmax = new Vector4(lutmax);
                    var   vzero = Vector4.Zero;
                    float famin = new Vector4(1 / 1024f).X;

                    while (ip < ipe)
                    {
                        var vf = Unsafe.ReadUnaligned <Vector4>(ip);

                        float f3 = vf.W;
                        if (f3 < famin)
                        {
                            // W below 1/1024: write an all-zero pixel instead of dividing.
                            Unsafe.WriteUnaligned(ip, vzero);
                        }
                        else
                        {
                            // Divide by W, scale to the table range and clamp.
                            vf = (vf * vlmax / f3).Clamp(vzero, vlmax);

                            float f0 = vf.X;
                            float f1 = vf.Y;
                            float f2 = vf.Z;

                            uint i0 = (uint)f0;
                            uint i1 = (uint)f1;
                            uint i2 = (uint)f2;

                            // Interpolate between adjacent table entries and multiply by W again; ip[3] is left unchanged.
                            ip[0] = Lerp(lp[i0], lp[i0 + 1], f0 - (int)i0) * f3;
                            ip[1] = Lerp(lp[i1], lp[i1 + 1], f1 - (int)i1) * f3;
                            ip[2] = Lerp(lp[i2], lp[i2 + 1], f2 - (int)i2) * f3;
                        }
                        ip += 4;
                    }
                }
            }
コード例 #6
0
            /// <summary>
            /// In-place conversion of a float buffer through an interpolated lookup table. Each value is
            /// scaled by <paramref name="lutmax"/>, clamped to [0, lutmax], and replaced by the linear
            /// interpolation between the two table entries surrounding that position.
            /// </summary>
            /// <param name="ipstart">Start of the float data; read and overwritten.</param>
            /// <param name="opstart">Must equal <paramref name="ipstart"/> (asserted in debug builds).</param>
            /// <param name="lutstart">Lookup table. Entries at index and index + 1 are read, with index up to <paramref name="lutmax"/>.</param>
            /// <param name="lutmax">Scale factor and upper clamp applied to the table position.</param>
            /// <param name="cb">Length of the data in bytes.</param>
            unsafe public static void ConvertFloat(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb)
            {
                Debug.Assert(ipstart == opstart);

                float *px = (float *)ipstart;
                float *pxEnd = (float *)(ipstart + cb);
                float *lut = lutstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    var scale = Vector256.Create((float)lutmax);
                    var floor = Vector256<float>.Zero;
                    var one = Vector256.Create(1);

                    // Eight floats per iteration while a full vector remains.
                    for (float *lastBlock = pxEnd - Vector256<float>.Count; px <= lastBlock; px += Vector256<float>.Count)
                    {
                        var scaled = Avx.Multiply(scale, Avx.LoadVector256(px));
                        var pos = Avx.Min(Avx.Max(floor, scaled), scale);

                        // Integer table index and the fractional remainder.
                        var idx = Avx.ConvertToVector256Int32WithTruncation(pos);
                        var frac = Avx.Subtract(pos, Avx.ConvertToVector256Single(idx));

                        var lo = Avx2.GatherVector256(lut, idx, sizeof(float));
                        var hi = Avx2.GatherVector256(lut, Avx2.Add(idx, one), sizeof(float));

                        Avx.Store(px, HWIntrinsics.Lerp(lo, hi, frac));
                    }
                }
                else
#endif
                {
                    var scale = new Vector4(lutmax);
                    var floor = Vector4.Zero;

                    // Four floats per iteration while a full Vector4 remains.
                    for (float *lastBlock = pxEnd - 4; px <= lastBlock; px += 4)
                    {
                        var pos = (Unsafe.ReadUnaligned<Vector4>(px) * scale).Clamp(floor, scale);

                        float p0 = pos.X, p1 = pos.Y, p2 = pos.Z, p3 = pos.W;
                        uint n0 = (uint)p0, n1 = (uint)p1, n2 = (uint)p2, n3 = (uint)p3;

                        px[0] = Lerp(lut[n0], lut[n0 + 1], p0 - (int)n0);
                        px[1] = Lerp(lut[n1], lut[n1 + 1], p1 - (int)n1);
                        px[2] = Lerp(lut[n2], lut[n2 + 1], p2 - (int)n2);
                        px[3] = Lerp(lut[n3], lut[n3 + 1], p3 - (int)n3);
                    }
                }

                // Scalar tail shared by both paths: clamp bounds are 0 and (float)lutmax in either case.
                float tailMin = 0f;
                float tailMax = lutmax;
                while (px < pxEnd)
                {
                    float pos = (*px * tailMax).Clamp(tailMin, tailMax);
                    uint  n = (uint)pos;

                    *px = Lerp(lut[n], lut[n + 1], pos - n);
                    px++;
                }
            }
コード例 #7
0
            /// <summary>
            /// Converts a line of 4-float pixels to 4-byte pixels. The first three components of each pixel
            /// are divided by the fourth (alpha), every component is scaled by 255 and rounded, and the
            /// result is clamped to a byte. Pixels whose alpha is below the minimum are written as zero.
            /// </summary>
            /// <param name="ipstart">Start of the float input.</param>
            /// <param name="opstart">Start of the byte output.</param>
            /// <param name="cb">Length of the input in bytes.</param>
            unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb)
            {
                float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb);
                byte * op = opstart;

#if HWINTRINSICS
                if (Avx2.IsSupported)
                {
                    var vzero  = Vector256 <float> .Zero;
                    // Minimum alpha: half of one 8-bit step.
                    var vmin   = Vector256.Create(0.5f / byte.MaxValue);
                    var vscale = Vector256.Create((float)byte.MaxValue);

                    // NOTE(review): PermuteMaskDeinterleave8x32 is defined elsewhere; presumably it restores element
                    // order after the per-128-bit-lane pack instructions below — confirm.
                    var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMaskDeinterleave8x32)));

                    // Each iteration consumes Vector256<byte>.Count (32) floats and produces 32 bytes.
                    ipe -= Vector256 <byte> .Count;
                    while (ip <= ipe)
                    {
                        var vf0 = Avx.LoadVector256(ip);
                        var vf1 = Avx.LoadVector256(ip + Vector256 <float> .Count);
                        var vf2 = Avx.LoadVector256(ip + Vector256 <float> .Count * 2);
                        var vf3 = Avx.LoadVector256(ip + Vector256 <float> .Count * 3);
                        ip += Vector256 <byte> .Count;

                        // NOTE(review): ShuffleMaskAlpha is defined elsewhere; presumably replicates each pixel's alpha across its lanes — confirm.
                        var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                        // Clamp alpha up to the minimum so the reciprocal below stays finite.
                        vfa0 = Avx.Max(vfa0, vmin);
                        vfa1 = Avx.Max(vfa1, vmin);
                        vfa2 = Avx.Max(vfa2, vmin);
                        vfa3 = Avx.Max(vfa3, vmin);

                        // Divide by alpha using the approximate reciprocal.
                        vf0 = Avx.Multiply(vf0, Avx.Reciprocal(vfa0));
                        vf1 = Avx.Multiply(vf1, Avx.Reciprocal(vfa1));
                        vf2 = Avx.Multiply(vf2, Avx.Reciprocal(vfa2));
                        vf3 = Avx.Multiply(vf3, Avx.Reciprocal(vfa3));

                        // NOTE(review): BlendMaskAlpha is defined elsewhere; presumably puts the (clamped) alpha back into the alpha lane — confirm.
                        vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                        vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                        vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                        vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                        // Where the clamped alpha equals the minimum (original alpha <= minimum), zero the lanes.
                        // NOTE(review): the scalar path below uses a strict '<' test, so the two paths can differ
                        // when alpha is exactly equal to the minimum.
                        vf0 = Avx.BlendVariable(vf0, vzero, HWIntrinsics.AvxCompareEqual(vfa0, vmin));
                        vf1 = Avx.BlendVariable(vf1, vzero, HWIntrinsics.AvxCompareEqual(vfa1, vmin));
                        vf2 = Avx.BlendVariable(vf2, vzero, HWIntrinsics.AvxCompareEqual(vfa2, vmin));
                        vf3 = Avx.BlendVariable(vf3, vzero, HWIntrinsics.AvxCompareEqual(vfa3, vmin));

                        // Scale to the 0..255 range.
                        vf0 = Avx.Multiply(vf0, vscale);
                        vf1 = Avx.Multiply(vf1, vscale);
                        vf2 = Avx.Multiply(vf2, vscale);
                        vf3 = Avx.Multiply(vf3, vscale);

                        // Float -> int32 conversion (non-truncating form).
                        var vi0 = Avx.ConvertToVector256Int32(vf0);
                        var vi1 = Avx.ConvertToVector256Int32(vf1);
                        var vi2 = Avx.ConvertToVector256Int32(vf2);
                        var vi3 = Avx.ConvertToVector256Int32(vf3);

                        // Narrow int32 -> int16 -> uint8 with saturation, which also clamps to 0..255.
                        var vs0 = Avx2.PackSignedSaturate(vi0, vi1);
                        var vs1 = Avx2.PackSignedSaturate(vi2, vi3);

                        var vb0 = Avx2.PackUnsignedSaturate(vs0, vs1);
                        vb0 = Avx2.PermuteVar8x32(vb0.AsInt32(), vmaskp).AsByte();

                        Avx.Store(op, vb0);
                        op += Vector256 <byte> .Count;
                    }
                    // Restore the true end pointer for the scalar loop below.
                    ipe += Vector256 <byte> .Count;
                }
                else if (Sse41.IsSupported)
                {
                    // Same sequence as the AVX2 path with 128-bit vectors: 16 floats in, 16 bytes out per
                    // iteration. No permute is applied after the packs here.
                    var vzero  = Vector128 <float> .Zero;
                    var vmin   = Vector128.Create(0.5f / byte.MaxValue);
                    var vscale = Vector128.Create((float)byte.MaxValue);

                    ipe -= Vector128 <byte> .Count;
                    while (ip <= ipe)
                    {
                        var vf0 = Sse.LoadVector128(ip);
                        var vf1 = Sse.LoadVector128(ip + Vector128 <float> .Count);
                        var vf2 = Sse.LoadVector128(ip + Vector128 <float> .Count * 2);
                        var vf3 = Sse.LoadVector128(ip + Vector128 <float> .Count * 3);
                        ip += Vector128 <byte> .Count;

                        var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                        var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                        // Clamp alpha up to the minimum.
                        vfa0 = Sse.Max(vfa0, vmin);
                        vfa1 = Sse.Max(vfa1, vmin);
                        vfa2 = Sse.Max(vfa2, vmin);
                        vfa3 = Sse.Max(vfa3, vmin);

                        // Divide by alpha using the approximate reciprocal.
                        vf0 = Sse.Multiply(vf0, Sse.Reciprocal(vfa0));
                        vf1 = Sse.Multiply(vf1, Sse.Reciprocal(vfa1));
                        vf2 = Sse.Multiply(vf2, Sse.Reciprocal(vfa2));
                        vf3 = Sse.Multiply(vf3, Sse.Reciprocal(vfa3));

                        vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                        vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                        vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                        vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                        // Zero the lanes where the clamped alpha equals the minimum.
                        vf0 = Sse41.BlendVariable(vf0, vzero, Sse.CompareEqual(vfa0, vmin));
                        vf1 = Sse41.BlendVariable(vf1, vzero, Sse.CompareEqual(vfa1, vmin));
                        vf2 = Sse41.BlendVariable(vf2, vzero, Sse.CompareEqual(vfa2, vmin));
                        vf3 = Sse41.BlendVariable(vf3, vzero, Sse.CompareEqual(vfa3, vmin));

                        vf0 = Sse.Multiply(vf0, vscale);
                        vf1 = Sse.Multiply(vf1, vscale);
                        vf2 = Sse.Multiply(vf2, vscale);
                        vf3 = Sse.Multiply(vf3, vscale);

                        var vi0 = Sse2.ConvertToVector128Int32(vf0);
                        var vi1 = Sse2.ConvertToVector128Int32(vf1);
                        var vi2 = Sse2.ConvertToVector128Int32(vf2);
                        var vi3 = Sse2.ConvertToVector128Int32(vf3);

                        // Narrow int32 -> int16 -> uint8 with saturation.
                        var vs0 = Sse2.PackSignedSaturate(vi0, vi1);
                        var vs1 = Sse2.PackSignedSaturate(vi2, vi3);

                        var vb0 = Sse2.PackUnsignedSaturate(vs0, vs1);

                        Sse2.Store(op, vb0);
                        op += Vector128 <byte> .Count;
                    }
                    ipe += Vector128 <byte> .Count;
                }
#endif

                // Scalar loop: handles whatever the vector loops left over, or the whole line when neither
                // vector path ran. fmin equals 0.5 / 255, the same minimum alpha as the vector paths.
                float fmax = new Vector4(byte.MaxValue).X, fround = new Vector4(0.5f).X, fmin = fround / fmax;

                while (ip < ipe)
                {
                    float f3 = ip[3];
                    if (f3 < fmin)
                    {
                        // Alpha below the minimum: write all four output bytes as zero in one store.
                        *(uint *)op = 0;
                    }
                    else
                    {
                        // 255 / alpha folds the divide-by-alpha and the byte scale into one factor;
                        // adding 0.5 before the int cast rounds to nearest.
                        float f3i = fmax / f3;
                        byte  o0  = ClampToByte((int)(ip[0] * f3i + fround));
                        byte  o1  = ClampToByte((int)(ip[1] * f3i + fround));
                        byte  o2  = ClampToByte((int)(ip[2] * f3i + fround));
                        byte  o3  = ClampToByte((int)(f3 * fmax + fround));
                        op[0] = o0;
                        op[1] = o1;
                        op[2] = o2;
                        op[3] = o3;
                    }

                    ip += 4;
                    op += 4;
                }
            }