예제 #1
0
        /// <summary>
        /// Emits an SSE2 routine that blends 32-bit source pixels over destination pixels in place.
        /// For every pixel the emitted code computes, per 8-bit channel,
        /// <c>dst = src + (dst * (255 - srcTopByte)) / 255</c>, where <c>srcTopByte</c> is the most
        /// significant byte of the 32-bit source pixel (presumably premultiplied alpha - confirm
        /// against the caller's pixel format).
        /// </summary>
        /// <param name="c">
        /// Code context that receives the instructions. The generated function takes three arguments:
        /// 0 = destination pointer, 1 = source pointer, 2 = number of 32-bit pixels to process.
        /// </param>
        protected override void Compile(CodeContext c)
        {
            var dst = c.IntPtr("dst");
            var src = c.IntPtr("src");

            // i = pixels (later: 4-pixel groups) still to process, j = counter of the one-pixel loop,
            // t = address of the constant pool emitted at the end of the function.
            var i = c.IntPtr("i");
            var j = c.IntPtr("j");
            var t = c.IntPtr("t");

            var cZero    = c.Xmm("cZero");    // All zero bits, used to widen bytes to words.
            var cMul255A = c.Xmm("cMul255A"); // 0x0080 in each 16-bit lane (rounding addend).
            var cMul255M = c.Xmm("cMul255M"); // 0x0101 in each 16-bit lane (multiplier).

            var x0 = c.Xmm("x0");
            var x1 = c.Xmm("x1");
            var y0 = c.Xmm("y0");
            var a0 = c.Xmm("a0");
            var a1 = c.Xmm("a1");

            var smallLoop = c.Label();
            var smallEnd  = c.Label();

            var largeLoop = c.Label();
            var largeEnd  = c.Label();

            var data = c.Label();

            c.SetArgument(0, dst);
            c.SetArgument(1, src);
            c.SetArgument(2, i);

            c.Allocate(dst);
            c.Allocate(src);
            c.Allocate(i);

            // How many pixels have to be processed to make the loop aligned:
            // j = ((0 - dst) & 15) >> 2, i.e. the number of 4-byte pixels up to the next
            // 16-byte boundary of dst. The constant loads are interleaved with that computation;
            // they do not touch the flags, so Jz below still tests the result of Shr.
            c.Lea(t, Memory.Ptr(data));
            c.Xor(j, j);
            c.Xorps(cZero, cZero);

            c.Sub(j, dst);
            c.Movaps(cMul255A, Memory.Ptr(t, 0));

            c.And(j, 15);
            c.Movaps(cMul255M, Memory.Ptr(t, 16));

            c.Shr(j, 2);
            c.Jz(smallEnd);

            // j = min(i, j).
            c.Cmp(j, i);
            c.Cmovg(j, i);

            // i -= j.
            c.Sub(i, j);

            // When the pixel count is zero, min(i, j) is zero as well. Entering the small loop
            // in that state would touch one pixel and then decrement j past zero, so skip it;
            // smallEnd exits the function when i is zero.
            c.Test(j, j);
            c.Jz(smallEnd);

            // Small loop: one pixel per iteration (also used for the unaligned tail).
            c.Bind(smallLoop);

            c.Pcmpeqb(a0, a0);
            c.Movd(y0, Memory.Ptr(src));

            // a0 = ~src; after the shift every word holds an inverted high byte.
            c.Pxor(a0, y0);
            c.Movd(x0, Memory.Ptr(dst));

            c.Psrlw(a0, 8);
            c.Punpcklbw(x0, cZero);

            // Broadcast word 1 (255 - top byte of the source pixel) to the four low words.
            c.Pshuflw(a0, a0, AsmJit.Common.Utils.Shuffle(1, 1, 1, 1));
            c.Punpcklbw(y0, cZero);

            // x0 = ((dst * a0) + 0x80) * 0x0101 >> 16, an approximation of (dst * a0) / 255.
            c.Pmullw(x0, a0);
            c.Paddsw(x0, cMul255A);
            c.Pmulhuw(x0, cMul255M);

            c.Paddw(x0, y0);
            c.Packuswb(x0, x0);

            c.Movd(Memory.Ptr(dst), x0);

            c.Add(dst, 4);
            c.Add(src, 4);

            c.Dec(j);
            c.Jnz(smallLoop);

            // Second section, prepare for an aligned loop.
            c.Bind(smallEnd);

            // Mov does not modify flags, so Jz still acts on the result of Test(i, i).
            c.Test(i, i);
            c.Mov(j, i);
            c.Jz(c.Exit);

            // j = leftover pixels (i % 4), i = number of 4-pixel groups (i / 4).
            c.And(j, 3);
            c.Shr(i, 2);
            c.Jz(largeEnd);

            // Aligned loop: four pixels per iteration, dst is 16-byte aligned here.
            c.Bind(largeLoop);

            c.Movups(y0, Memory.Ptr(src));
            c.Pcmpeqb(a0, a0);
            c.Movaps(x0, Memory.Ptr(dst));

            c.Xorps(a0, y0);
            c.Movaps(x1, x0);

            c.Psrlw(a0, 8);
            c.Punpcklbw(x0, cZero);

            c.Movaps(a1, a0);
            c.Punpcklwd(a0, a0);

            c.Punpckhbw(x1, cZero);
            c.Punpckhwd(a1, a1);

            // Spread the inverted top byte of each source pixel over that pixel's four words:
            // a0 covers pixels 0-1 (matching x0), a1 covers pixels 2-3 (matching x1).
            c.Pshufd(a0, a0, AsmJit.Common.Utils.Shuffle(3, 3, 1, 1));
            c.Pshufd(a1, a1, AsmJit.Common.Utils.Shuffle(3, 3, 1, 1));

            c.Pmullw(x0, a0);
            c.Pmullw(x1, a1);

            c.Paddsw(x0, cMul255A);
            c.Paddsw(x1, cMul255A);

            // All eight word lanes are multiplied here, so every lane of cMul255M must be 0x0101.
            c.Pmulhuw(x0, cMul255M);
            c.Pmulhuw(x1, cMul255M);

            c.Add(src, 16);
            c.Packuswb(x0, x1);

            c.Paddw(x0, y0);
            c.Movaps(Memory.Ptr(dst), x0);

            c.Add(dst, 16);

            c.Dec(i);
            c.Jnz(largeLoop);

            // Process the remaining (at most three) pixels with the one-pixel loop.
            c.Bind(largeEnd);
            c.Test(j, j);
            c.Jnz(smallLoop);

            // Data: two 16-byte constants, 16-byte aligned.
            //   [t + 0]  = 0x0080 in every 16-bit lane (loaded into cMul255A)
            //   [t + 16] = 0x0101 in every 16-bit lane (loaded into cMul255M)
            c.Data(data, 16,
                   Data.Of(0x0080008000800080, 0x0080008000800080),
                   Data.Of(0x0101010101010101, 0x0101010101010101));
        }