/// <summary>
/// Emits a JIT routine that blends <c>src</c> over <c>dst</c> for 32-bit pixels:
/// per channel, <c>dst = src + dst * (255 - srcAlpha) / 255</c>, where the alpha
/// is taken from the top byte of each source pixel.
/// </summary>
/// <remarks>
/// Generated function arguments: 0 = destination pointer, 1 = source pointer,
/// 2 = pixel count. The routine first processes single pixels until <c>dst</c>
/// is 16-byte aligned, then four pixels per iteration with aligned destination
/// accesses, and finally the remaining 0-3 pixels one at a time.
/// NOTE(review): a pixel count of 0 combined with a misaligned <c>dst</c> appears
/// to enter the single-pixel loop with j == 0 - confirm callers never pass 0.
/// </remarks>
/// <param name="c">Code context that receives the emitted instructions.</param>
protected override void Compile(CodeContext c)
{
    var dst = c.IntPtr("dst");
    var src = c.IntPtr("src");

    var i = c.IntPtr("i");
    var j = c.IntPtr("j");
    var t = c.IntPtr("t");

    var cZero = c.Xmm("cZero");
    var cMul255A = c.Xmm("cMul255A");
    var cMul255M = c.Xmm("cMul255M");

    var x0 = c.Xmm("x0");
    var x1 = c.Xmm("x1");
    var y0 = c.Xmm("y0");
    var a0 = c.Xmm("a0");
    var a1 = c.Xmm("a1");

    var smallLoop = c.Label();
    var smallEnd = c.Label();

    var largeLoop = c.Label();
    var largeEnd = c.Label();

    var data = c.Label();

    c.SetArgument(0, dst);
    c.SetArgument(1, src);
    c.SetArgument(2, i);

    c.Allocate(dst);
    c.Allocate(src);
    c.Allocate(i);

    // How many pixels have to be processed to make the loop aligned:
    // j = ((0 - dst) & 15) >> 2. The constant loads are interleaved here;
    // they do not touch the flags, so Jz below still tests the result of Shr.
    c.Lea(t, Memory.Ptr(data));
    c.Xor(j, j);
    c.Xorps(cZero, cZero);

    c.Sub(j, dst);
    c.Movaps(cMul255A, Memory.Ptr(t, 0));

    c.And(j, 15);
    c.Movaps(cMul255M, Memory.Ptr(t, 16));

    c.Shr(j, 2);
    c.Jz(smallEnd);

    // j = min(i, j).
    c.Cmp(j, i);
    c.Cmovg(j, i);

    // i -= j.
    c.Sub(i, j);

    // Small loop: one pixel per iteration, j iterations.
    c.Bind(smallLoop);

    // a0 = ~src; after the word shift each 16-bit lane holds 255 - (odd byte).
    c.Pcmpeqb(a0, a0);
    c.Movd(y0, Memory.Ptr(src));

    c.Pxor(a0, y0);
    c.Movd(x0, Memory.Ptr(dst));

    c.Psrlw(a0, 8);
    c.Punpcklbw(x0, cZero);

    // Broadcast word 1 (255 - alpha) to the four low lanes.
    c.Pshuflw(a0, a0, AsmJit.Common.Utils.Shuffle(1, 1, 1, 1));
    c.Punpcklbw(y0, cZero);

    // x0 = (dst * (255 - alpha) + 0x80) * 0x0101 >> 16, i.e. roughly / 255 with rounding.
    c.Pmullw(x0, a0);
    c.Paddsw(x0, cMul255A);
    c.Pmulhuw(x0, cMul255M);

    c.Paddw(x0, y0);
    c.Packuswb(x0, x0);

    c.Movd(Memory.Ptr(dst), x0);

    c.Add(dst, 4);
    c.Add(src, 4);

    c.Dec(j);
    c.Jnz(smallLoop);

    // Second section, prepare for an aligned loop.
    c.Bind(smallEnd);

    // Mov does not change flags, so Jz still acts on Test(i, i).
    c.Test(i, i);
    c.Mov(j, i);
    c.Jz(c.Exit);

    // j = trailing pixels (i % 4), i = number of 4-pixel groups.
    c.And(j, 3);
    c.Shr(i, 2);
    c.Jz(largeEnd);

    // Aligned loop: four pixels per iteration; dst is accessed with aligned moves,
    // src with an unaligned load.
    c.Bind(largeLoop);

    c.Movups(y0, Memory.Ptr(src));
    c.Pcmpeqb(a0, a0);
    c.Movaps(x0, Memory.Ptr(dst));

    c.Xorps(a0, y0);
    c.Movaps(x1, x0);

    c.Psrlw(a0, 8);
    c.Punpcklbw(x0, cZero);

    c.Movaps(a1, a0);
    c.Punpcklwd(a0, a0);

    c.Punpckhbw(x1, cZero);
    c.Punpckhwd(a1, a1);

    // a0 = inverse alpha of pixels 0 and 1 (four lanes each), a1 = pixels 2 and 3.
    c.Pshufd(a0, a0, AsmJit.Common.Utils.Shuffle(3, 3, 1, 1));
    c.Pshufd(a1, a1, AsmJit.Common.Utils.Shuffle(3, 3, 1, 1));

    c.Pmullw(x0, a0);
    c.Pmullw(x1, a1);

    c.Paddsw(x0, cMul255A);
    c.Paddsw(x1, cMul255A);

    // All eight 16-bit lanes of cMul255M are used here.
    c.Pmulhuw(x0, cMul255M);
    c.Pmulhuw(x1, cMul255M);

    c.Add(src, 16);
    c.Packuswb(x0, x1);

    c.Paddw(x0, y0);
    c.Movaps(Memory.Ptr(dst), x0);

    c.Add(dst, 16);

    c.Dec(i);
    c.Jnz(largeLoop);

    // Remaining 0-3 pixels are handled by the single-pixel loop.
    c.Bind(largeEnd);
    c.Test(j, j);
    c.Jnz(smallLoop);

    // Data: two 128-bit constants read through `t` at offsets 0 and 16.
    //   offset  0 (cMul255A): 0x0080 in every 16-bit lane (rounding bias).
    //   offset 16 (cMul255M): 0x0101 in every 16-bit lane (multiplier for the / 255 step).
    // Both 64-bit halves of cMul255M must be 0x0101...: the aligned loop multiplies
    // two pixels per register, and the upper half scales pixels 1 and 3 of each group.
    c.Data(data, 16,
        Data.Of(0x0080008000800080, 0x0080008000800080),
        Data.Of(0x0101010101010101, 0x0101010101010101));
}