Пример #1
0
        /// <summary>
        /// For every entry of the x map, multiplies a run of input floats by that entry's weights,
        /// accumulates the products in SIMD registers and writes the single reduced float
        /// (via <c>HorizontalAdd()</c>) to the output buffer.
        /// </summary>
        /// <param name="istart">Base of the input line; read as floats.</param>
        /// <param name="tstart">Base of the output buffer; written as floats.</param>
        /// <param name="cb">Size in bytes of the output range; the loop ends once the write pointer reaches <c>tstart + cb</c>.</param>
        /// <param name="mapxstart">Map read as pairs of uints: an input index, then a byte offset (from <paramref name="mapxstart"/>) to the float weights for that output.</param>
        /// <param name="smapx"><c>smapx * channels / VectorSse.Count</c> (integer division) 128-bit vectors are processed per output.</param>
        /// <param name="smapy">Consecutive outputs are written <c>smapy * channels</c> floats apart.</param>
        unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy)
        {
            uint kernelFloats = (uint)smapx * channels;
            uint outStep      = (uint)smapy * channels;
            uint vecsPerLine  = kernelFloats / (uint)VectorSse.Count;

            uint * map    = (uint *)mapxstart;
            float *dst    = (float *)tstart;
            float *dstEnd = (float *)(tstart + (uint)cb);

            while (dst < dstEnd)
            {
                // Each map entry is two uints: the input index, then the byte offset of the weights.
                uint srcIndex     = map[0];
                uint weightOffset = map[1];
                map += 2;

                float *src     = (float *)istart + srcIndex * channels;
                float *weights = (float *)(mapxstart + weightOffset);
                uint   pending = vecsPerLine; // work left, counted in 128-bit vectors

                VectorSse sum;

                if (Avx.IsSupported && pending >= 2)
                {
                    var wide = VectorAvx.Zero;

                    // Main loop: four 256-bit vectors (eight 128-bit units) per pass.
                    while (pending >= 8)
                    {
                        var s0 = Avx.LoadVector256(src);
                        var s1 = Avx.LoadVector256(src + VectorAvx.Count);
                        var s2 = Avx.LoadVector256(src + VectorAvx.Count * 2);
                        var s3 = Avx.LoadVector256(src + VectorAvx.Count * 3);

                        wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
                        wide = HWIntrinsics.MultiplyAdd(wide, s1, weights + VectorAvx.Count);
                        wide = HWIntrinsics.MultiplyAdd(wide, s2, weights + VectorAvx.Count * 2);
                        wide = HWIntrinsics.MultiplyAdd(wide, s3, weights + VectorAvx.Count * 3);

                        src     += VectorAvx.Count * 4;
                        weights += VectorAvx.Count * 4;
                        pending -= 8;
                    }

                    // 0..7 units remain, i.e. at most three whole 256-bit vectors.
                    switch (pending >> 1)
                    {
                        case 3:
                        {
                            var s0 = Avx.LoadVector256(src);
                            var s1 = Avx.LoadVector256(src + VectorAvx.Count);
                            var s2 = Avx.LoadVector256(src + VectorAvx.Count * 2);

                            wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
                            wide = HWIntrinsics.MultiplyAdd(wide, s1, weights + VectorAvx.Count);
                            wide = HWIntrinsics.MultiplyAdd(wide, s2, weights + VectorAvx.Count * 2);

                            src     += VectorAvx.Count * 3;
                            weights += VectorAvx.Count * 3;
                            pending -= 6;
                            break;
                        }
                        case 2:
                        {
                            var s0 = Avx.LoadVector256(src);
                            var s1 = Avx.LoadVector256(src + VectorAvx.Count);

                            wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
                            wide = HWIntrinsics.MultiplyAdd(wide, s1, weights + VectorAvx.Count);

                            src     += VectorAvx.Count * 2;
                            weights += VectorAvx.Count * 2;
                            pending -= 4;
                            break;
                        }
                        case 1:
                        {
                            var s0 = Avx.LoadVector256(src);

                            wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);

                            src     += VectorAvx.Count;
                            weights += VectorAvx.Count;
                            pending -= 2;
                            break;
                        }
                    }

                    // Fold the two 128-bit halves so the shared tail below can keep accumulating.
                    sum = Sse.Add(wide.GetLower(), wide.GetUpper());
                }
                else
                {
                    sum = VectorSse.Zero;

                    // 128-bit path: four vectors per pass, then an optional pair.
                    while (pending >= 4)
                    {
                        var s0 = Sse.LoadVector128(src);
                        var s1 = Sse.LoadVector128(src + VectorSse.Count);
                        var s2 = Sse.LoadVector128(src + VectorSse.Count * 2);
                        var s3 = Sse.LoadVector128(src + VectorSse.Count * 3);

                        sum = HWIntrinsics.MultiplyAdd(sum, s0, weights);
                        sum = HWIntrinsics.MultiplyAdd(sum, s1, weights + VectorSse.Count);
                        sum = HWIntrinsics.MultiplyAdd(sum, s2, weights + VectorSse.Count * 2);
                        sum = HWIntrinsics.MultiplyAdd(sum, s3, weights + VectorSse.Count * 3);

                        src     += VectorSse.Count * 4;
                        weights += VectorSse.Count * 4;
                        pending -= 4;
                    }

                    if (pending >= 2)
                    {
                        var s0 = Sse.LoadVector128(src);
                        var s1 = Sse.LoadVector128(src + VectorSse.Count);

                        sum = HWIntrinsics.MultiplyAdd(sum, s0, weights);
                        sum = HWIntrinsics.MultiplyAdd(sum, s1, weights + VectorSse.Count);

                        src     += VectorSse.Count * 2;
                        weights += VectorSse.Count * 2;
                        pending -= 2;
                    }
                }

                // Both paths leave at most one 128-bit vector unprocessed.
                if (pending > 0)
                {
                    sum = HWIntrinsics.MultiplyAdd(sum, Sse.LoadVector128(src), weights);
                }

                *dst = sum.HorizontalAdd();
                dst += outStep;
            }
        }
Пример #2
0
        /// <summary>
        /// Writes <paramref name="ow"/> consecutive floats to <paramref name="ostart"/>. Each one is the
        /// SIMD-accumulated product of a run of floats from <paramref name="tstart"/> with the weights
        /// at <paramref name="pmapy"/>, reduced to a scalar via <c>HorizontalAdd()</c>.
        /// </summary>
        /// <param name="tstart">Base of the intermediate buffer; read as floats. Position <c>x</c> starts at float offset <c>x * smapy * channels</c>.</param>
        /// <param name="ostart">Base of the output; written as floats, one per position.</param>
        /// <param name="ox">First position to process.</param>
        /// <param name="ow">Number of positions to process.</param>
        /// <param name="pmapy">Float weights; the same weights are applied to every position.</param>
        /// <param name="smapy"><c>smapy * channels / VectorSse.Count</c> (integer division) 128-bit vectors are processed per position.</param>
        unsafe void IConvolver.WriteDestLine(byte *tstart, byte *ostart, int ox, int ow, byte *pmapy, int smapy)
        {
            uint   runFloats  = (uint)smapy * channels;
            uint   vecsPerRun = runFloats / (uint)VectorSse.Count;
            float *dst        = (float *)ostart;
            int    xEnd       = ox + ow;

            for (int x = ox; x < xEnd; x++)
            {
                float *src     = (float *)tstart + (uint)x * runFloats;
                float *weights = (float *)pmapy; // restart at the first weight for each position
                uint   pending = vecsPerRun;     // work left, counted in 128-bit vectors

                VectorSse sum;

                if (Avx.IsSupported && pending >= 2)
                {
                    var wide = VectorAvx.Zero;

                    // Two 256-bit vectors (four 128-bit units) per pass.
                    while (pending >= 4)
                    {
                        var s0 = Avx.LoadVector256(src);
                        var s1 = Avx.LoadVector256(src + VectorAvx.Count);

                        wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
                        wide = HWIntrinsics.MultiplyAdd(wide, s1, weights + VectorAvx.Count);

                        src     += VectorAvx.Count * 2;
                        weights += VectorAvx.Count * 2;
                        pending -= 4;
                    }

                    if (pending >= 2)
                    {
                        var s0 = Avx.LoadVector256(src);

                        wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);

                        src     += VectorAvx.Count;
                        weights += VectorAvx.Count;
                        pending -= 2;
                    }

                    // Fold the two 128-bit halves so the shared tail below can keep accumulating.
                    sum = Sse.Add(wide.GetLower(), wide.GetUpper());
                }
                else
                {
                    sum = VectorSse.Zero;

                    while (pending >= 2)
                    {
                        var s0 = Sse.LoadVector128(src);
                        var s1 = Sse.LoadVector128(src + VectorSse.Count);

                        sum = HWIntrinsics.MultiplyAdd(sum, s0, weights);
                        sum = HWIntrinsics.MultiplyAdd(sum, s1, weights + VectorSse.Count);

                        src     += VectorSse.Count * 2;
                        weights += VectorSse.Count * 2;
                        pending -= 2;
                    }
                }

                // Both paths leave at most one 128-bit vector unprocessed.
                if (pending > 0)
                {
                    sum = HWIntrinsics.MultiplyAdd(sum, Sse.LoadVector128(src), weights);
                }

                *dst = sum.HorizontalAdd();
                dst++;
            }
        }
Пример #3
0
        /// <summary>
        /// For every entry of the x map, multiplies a run of input floats by that entry's weights using
        /// three parallel accumulators, then stores one 4-float vector to the output. Lane k (k = 0..2)
        /// of the stored vector is the sum of all products whose position in the run is congruent to
        /// k modulo 3 (presumably three interleaved channels; confirm against the enclosing class).
        /// </summary>
        /// <param name="istart">Base of the input line; read as floats.</param>
        /// <param name="tstart">Base of the output buffer; a 4-float vector is stored per map entry.</param>
        /// <param name="cb">Size in bytes of the output range; the loop ends once the write pointer reaches <c>tstart + cb</c>.</param>
        /// <param name="mapxstart">Map read as pairs of uints: an input index, then a byte offset (from <paramref name="mapxstart"/>) to the float weights for that output.</param>
        /// <param name="smapx"><c>smapx * channels / VectorSse.Count</c> (integer division) 128-bit vectors are processed per output.</param>
        /// <param name="smapy">Consecutive outputs are written <c>smapy * 4</c> floats apart.</param>
        unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy)
        {
            uint kernelFloats = (uint)smapx * channels;
            uint outStep      = (uint)smapy * 4;
            uint vecsPerLine  = kernelFloats / (uint)VectorSse.Count;

            uint * map    = (uint *)mapxstart;
            float *dst    = (float *)tstart;
            float *dstEnd = (float *)(tstart + (uint)cb);

            while (dst < dstEnd)
            {
                // Each map entry is two uints: the input index, then the byte offset of the weights.
                uint srcIndex     = map[0];
                uint weightOffset = map[1];
                map += 2;

                float *src     = (float *)istart + srcIndex * channels;
                float *weights = (float *)(mapxstart + weightOffset);
                uint   pending = vecsPerLine; // work left, counted in 128-bit vectors

                VectorSse acc0, acc1, acc2;

                if (Avx.IsSupported && pending >= 6)
                {
                    var wide0 = VectorAvx.Zero;
                    var wide1 = VectorAvx.Zero;
                    var wide2 = VectorAvx.Zero;

                    // The guard above guarantees at least one pass of three 256-bit vectors.
                    do
                    {
                        var s0 = Avx.LoadVector256(src);
                        var s1 = Avx.LoadVector256(src + VectorAvx.Count);
                        var s2 = Avx.LoadVector256(src + VectorAvx.Count * 2);

                        wide0 = HWIntrinsics.MultiplyAdd(wide0, s0, weights);
                        wide1 = HWIntrinsics.MultiplyAdd(wide1, s1, weights + VectorAvx.Count);
                        wide2 = HWIntrinsics.MultiplyAdd(wide2, s2, weights + VectorAvx.Count * 2);

                        src     += VectorAvx.Count * 3;
                        weights += VectorAvx.Count * 3;
                        pending -= 6;
                    }
                    while (pending >= 6);

                    // The six 128-bit halves repeat with period three, so halves 0/3, 1/4 and 2/5
                    // line up lane-for-lane and are combined into the three 128-bit accumulators.
                    acc0 = Sse.Add(wide0.GetLower(), wide1.GetUpper());
                    acc1 = Sse.Add(wide0.GetUpper(), wide2.GetLower());
                    acc2 = Sse.Add(wide1.GetLower(), wide2.GetUpper());
                }
                else
                {
                    acc0 = VectorSse.Zero;
                    acc1 = VectorSse.Zero;
                    acc2 = VectorSse.Zero;
                }

                // Remaining work in groups of three 128-bit vectors.
                // NOTE(review): terminates only if the count is a multiple of 3 - presumably
                // guaranteed by how the caller sizes the kernel; confirm.
                while (pending != 0)
                {
                    var s0 = Sse.LoadVector128(src);
                    var s1 = Sse.LoadVector128(src + VectorSse.Count);
                    var s2 = Sse.LoadVector128(src + VectorSse.Count * 2);

                    acc0 = HWIntrinsics.MultiplyAdd(acc0, s0, weights);
                    acc1 = HWIntrinsics.MultiplyAdd(acc1, s1, weights + VectorSse.Count);
                    acc2 = HWIntrinsics.MultiplyAdd(acc2, s2, weights + VectorSse.Count * 2);

                    src     += VectorSse.Count * 3;
                    weights += VectorSse.Count * 3;
                    pending -= 3;
                }

                // Rearrange each accumulator so that its elements other than the first land in the
                // lane matching their position modulo 3, then add the three rearranged vectors.
                var rot0   = Sse.Shuffle(acc0, acc0, 0b_00_10_01_11);
                var rot1   = Sse.Shuffle(acc1, acc1, 0b_00_01_11_10);
                var rot2   = Sse.Shuffle(acc2, acc2, 0b_00_11_10_01);
                var folded = Sse.Add(Sse.Add(rot0, rot1), rot2);

                // Lanes 0..2 take the first element of acc0, acc1 and acc2 respectively; adding the
                // folded vector completes the three sums. Lane 3 is a by-product of the shuffles.
                var heads = Sse.MoveLowToHigh(Sse.UnpackLow(acc0, acc1), acc2);

                Sse.Store(dst, Sse.Add(heads, folded));
                dst += outStep;
            }
        }