/// <summary>
/// Convolves one source line, storing a single float (the horizontal sum of the accumulated products) per map entry.
/// </summary>
/// <param name="istart">Start of the source samples; read as floats.</param>
/// <param name="tstart">Start of the output region; one float is written every <c>smapy * channels</c> floats.</param>
/// <param name="cb">Byte length of the output region; the loop ends once the write cursor reaches <c>tstart + cb</c>.</param>
/// <param name="mapxstart">Map read as consecutive uint pairs: a source index (scaled by <c>channels</c>), then a byte offset from <c>mapxstart</c> to the float weights for that entry.</param>
/// <param name="smapx"><c>smapx * channels</c> is the number of floats multiplied per entry; only whole 128-bit vectors are processed.</param>
/// <param name="smapy">Scaled by <c>channels</c> to obtain the distance, in floats, between consecutive outputs.</param>
unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy)
{
	// NOTE(review): the reduction to one scalar per entry suggests `channels` is 1 here; it is declared outside this block, so confirm.
	float *dst = (float *)tstart;
	float *dstEnd = (float *)(tstart + (uint)cb);
	uint *mapEntry = (uint *)mapxstart;
	uint kernelFloats = (uint)smapx * channels;
	uint dstStep = (uint)smapy * channels;

	// Kernel length in 128-bit vectors; every counter below is expressed in this unit.
	uint kernelVecs = kernelFloats / (uint)VectorSse.Count;

	for (; dst < dstEnd; dst += dstStep)
	{
		// One map entry = source index followed by the byte offset of its weights.
		uint srcIndex = mapEntry[0];
		uint weightOffset = mapEntry[1];
		mapEntry += 2;

		uint remaining = kernelVecs;
		float *src = (float *)istart + srcIndex * channels;
		float *weights = (float *)(mapxstart + weightOffset);

		VectorSse sum;

		if (Avx.IsSupported && remaining >= 2)
		{
			var wide = VectorAvx.Zero;

			// Main AVX loop: four 256-bit vectors (eight 128-bit units) per pass.
			while (remaining >= 8)
			{
				var s0 = Avx.LoadVector256(src);
				var s1 = Avx.LoadVector256(src + VectorAvx.Count);
				var s2 = Avx.LoadVector256(src + VectorAvx.Count * 2);
				var s3 = Avx.LoadVector256(src + VectorAvx.Count * 3);
				src += VectorAvx.Count * 4;

				wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
				wide = HWIntrinsics.MultiplyAdd(wide, s1, weights + VectorAvx.Count);
				wide = HWIntrinsics.MultiplyAdd(wide, s2, weights + VectorAvx.Count * 2);
				wide = HWIntrinsics.MultiplyAdd(wide, s3, weights + VectorAvx.Count * 3);
				weights += VectorAvx.Count * 4;

				remaining -= 8;
			}

			// Fewer than 8 units remain, so (remaining >> 1) is the number of whole 256-bit vectors left (0..3).
			switch (remaining >> 1)
			{
				case 3:
				{
					remaining -= 6;

					var s0 = Avx.LoadVector256(src);
					var s1 = Avx.LoadVector256(src + VectorAvx.Count);
					var s2 = Avx.LoadVector256(src + VectorAvx.Count * 2);
					src += VectorAvx.Count * 3;

					wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
					wide = HWIntrinsics.MultiplyAdd(wide, s1, weights + VectorAvx.Count);
					wide = HWIntrinsics.MultiplyAdd(wide, s2, weights + VectorAvx.Count * 2);
					weights += VectorAvx.Count * 3;
					break;
				}
				case 2:
				{
					remaining -= 4;

					var s0 = Avx.LoadVector256(src);
					var s1 = Avx.LoadVector256(src + VectorAvx.Count);
					src += VectorAvx.Count * 2;

					wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
					wide = HWIntrinsics.MultiplyAdd(wide, s1, weights + VectorAvx.Count);
					weights += VectorAvx.Count * 2;
					break;
				}
				case 1:
				{
					remaining -= 2;

					var s0 = Avx.LoadVector256(src);
					src += VectorAvx.Count;

					wide = HWIntrinsics.MultiplyAdd(wide, s0, weights);
					weights += VectorAvx.Count;
					break;
				}
			}

			// Fold the two 128-bit halves together so the shared tail can keep accumulating.
			sum = Sse.Add(wide.GetLower(), wide.GetUpper());
		}
		else
		{
			sum = VectorSse.Zero;

			// Main SSE loop: four 128-bit vectors per pass.
			while (remaining >= 4)
			{
				var s0 = Sse.LoadVector128(src);
				var s1 = Sse.LoadVector128(src + VectorSse.Count);
				var s2 = Sse.LoadVector128(src + VectorSse.Count * 2);
				var s3 = Sse.LoadVector128(src + VectorSse.Count * 3);
				src += VectorSse.Count * 4;

				sum = HWIntrinsics.MultiplyAdd(sum, s0, weights);
				sum = HWIntrinsics.MultiplyAdd(sum, s1, weights + VectorSse.Count);
				sum = HWIntrinsics.MultiplyAdd(sum, s2, weights + VectorSse.Count * 2);
				sum = HWIntrinsics.MultiplyAdd(sum, s3, weights + VectorSse.Count * 3);
				weights += VectorSse.Count * 4;

				remaining -= 4;
			}

			if (remaining >= 2)
			{
				remaining -= 2;

				var s0 = Sse.LoadVector128(src);
				var s1 = Sse.LoadVector128(src + VectorSse.Count);
				src += VectorSse.Count * 2;

				sum = HWIntrinsics.MultiplyAdd(sum, s0, weights);
				sum = HWIntrinsics.MultiplyAdd(sum, s1, weights + VectorSse.Count);
				weights += VectorSse.Count * 2;
			}
		}

		// Both paths leave at most one 128-bit vector unprocessed.
		if (remaining != 0)
		{
			var last = Sse.LoadVector128(src);
			sum = HWIntrinsics.MultiplyAdd(sum, last, weights);
		}

		*dst = sum.HorizontalAdd();
	}
}
/// <summary>
/// Writes <c>ow</c> consecutive output floats, one per column starting at column <c>ox</c>.
/// Each output is the horizontal sum of the products of one column's floats with the weights at <c>pmapy</c>.
/// </summary>
/// <param name="tstart">Start of the input buffer, read as floats; column <c>x</c> begins at float offset <c>x * smapy * channels</c>.</param>
/// <param name="ostart">Start of the output, written as consecutive floats.</param>
/// <param name="ox">Index of the first column to process.</param>
/// <param name="ow">Number of columns to process.</param>
/// <param name="pmapy">Float weights; the same weights are applied to every column.</param>
/// <param name="smapy"><c>smapy * channels</c> is the number of floats per column; only whole 128-bit vectors are processed.</param>
unsafe void IConvolver.WriteDestLine(byte *tstart, byte *ostart, int ox, int ow, byte *pmapy, int smapy)
{
	float *dst = (float *)ostart;
	uint columnFloats = (uint)smapy * channels;

	// Column length in 128-bit vectors; the loops below count in this unit.
	uint columnVecs = columnFloats / (uint)VectorSse.Count;

	int xEnd = ox + ow;
	for (int x = ox; x < xEnd; x++, dst++)
	{
		uint remaining = columnVecs;
		float *col = (float *)tstart + (uint)x * columnFloats;
		float *weights = (float *)pmapy;

		VectorSse sum;

		if (Avx.IsSupported && remaining >= 2)
		{
			var wide = VectorAvx.Zero;

			// Two 256-bit vectors (four 128-bit units) per pass.
			while (remaining >= 4)
			{
				var t0 = Avx.LoadVector256(col);
				var t1 = Avx.LoadVector256(col + VectorAvx.Count);
				col += VectorAvx.Count * 2;

				wide = HWIntrinsics.MultiplyAdd(wide, t0, weights);
				wide = HWIntrinsics.MultiplyAdd(wide, t1, weights + VectorAvx.Count);
				weights += VectorAvx.Count * 2;

				remaining -= 4;
			}

			if (remaining >= 2)
			{
				remaining -= 2;

				var t0 = Avx.LoadVector256(col);
				col += VectorAvx.Count;

				wide = HWIntrinsics.MultiplyAdd(wide, t0, weights);
				weights += VectorAvx.Count;
			}

			// Fold the two 128-bit halves together so the shared tail can keep accumulating.
			sum = Sse.Add(wide.GetLower(), wide.GetUpper());
		}
		else
		{
			sum = VectorSse.Zero;

			// Two 128-bit vectors per pass.
			while (remaining >= 2)
			{
				var t0 = Sse.LoadVector128(col);
				var t1 = Sse.LoadVector128(col + VectorSse.Count);
				col += VectorSse.Count * 2;

				sum = HWIntrinsics.MultiplyAdd(sum, t0, weights);
				sum = HWIntrinsics.MultiplyAdd(sum, t1, weights + VectorSse.Count);
				weights += VectorSse.Count * 2;

				remaining -= 2;
			}
		}

		// Both paths leave at most one 128-bit vector unprocessed.
		if (remaining != 0)
		{
			var last = Sse.LoadVector128(col);
			sum = HWIntrinsics.MultiplyAdd(sum, last, weights);
		}

		*dst = sum.HorizontalAdd();
	}
}
/// <summary>
/// Convolves one source line, storing one 4-float vector per map entry.
/// Products are accumulated in three 128-bit accumulators that are then recombined so that
/// lanes 0..2 of the stored vector each hold the sum of four accumulator lanes.
/// </summary>
/// <param name="istart">Start of the source samples; read as floats.</param>
/// <param name="tstart">Start of the output region; a 4-float vector is stored every <c>smapy * 4</c> floats.</param>
/// <param name="cb">Byte length of the output region; the loop ends once the write cursor reaches <c>tstart + cb</c>.</param>
/// <param name="mapxstart">Map read as consecutive uint pairs: a source index (scaled by <c>channels</c>), then a byte offset from <c>mapxstart</c> to the float weights for that entry.</param>
/// <param name="smapx"><c>smapx * channels</c> is the number of floats multiplied per entry, consumed three 128-bit vectors at a time.</param>
/// <param name="smapy">Multiplied by 4 to obtain the distance, in floats, between consecutive outputs.</param>
unsafe void IConvolver.ConvolveSourceLine(byte *istart, byte *tstart, int cb, byte *mapxstart, int smapx, int smapy)
{
	// NOTE(review): the three-vector stepping and lane recombination fit interleaved 3-channel data;
	// `channels` is declared outside this block, so confirm it is 3.
	float *dst = (float *)tstart;
	float *dstEnd = (float *)(tstart + (uint)cb);
	uint *mapEntry = (uint *)mapxstart;
	uint kernelFloats = (uint)smapx * channels;
	uint dstStep = (uint)smapy * 4;

	// Kernel length in 128-bit vectors; the loops below count in this unit.
	uint kernelVecs = kernelFloats / (uint)VectorSse.Count;

	for (; dst < dstEnd; dst += dstStep)
	{
		// One map entry = source index followed by the byte offset of its weights.
		uint srcIndex = mapEntry[0];
		uint weightOffset = mapEntry[1];
		mapEntry += 2;

		uint remaining = kernelVecs;
		float *src = (float *)istart + srcIndex * channels;
		float *weights = (float *)(mapxstart + weightOffset);

		VectorSse acc0 = VectorSse.Zero;
		VectorSse acc1 = VectorSse.Zero;
		VectorSse acc2 = VectorSse.Zero;

		if (Avx.IsSupported && remaining >= 6)
		{
			var wide0 = VectorAvx.Zero;
			var wide1 = VectorAvx.Zero;
			var wide2 = VectorAvx.Zero;

			// Three 256-bit vectors (six 128-bit units) per pass, one accumulator each.
			do
			{
				var s0 = Avx.LoadVector256(src);
				var s1 = Avx.LoadVector256(src + VectorAvx.Count);
				var s2 = Avx.LoadVector256(src + VectorAvx.Count * 2);
				src += VectorAvx.Count * 3;

				wide0 = HWIntrinsics.MultiplyAdd(wide0, s0, weights);
				wide1 = HWIntrinsics.MultiplyAdd(wide1, s1, weights + VectorAvx.Count);
				wide2 = HWIntrinsics.MultiplyAdd(wide2, s2, weights + VectorAvx.Count * 2);
				weights += VectorAvx.Count * 3;

				remaining -= 6;
			}
			while (remaining >= 6);

			// The six 128-bit halves are the 128-bit vectors 0..5 of each pass; pair vector k with
			// vector k + 3 so the result lines up with the three-accumulator SSE loop below.
			acc0 = Sse.Add(wide0.GetLower(), wide1.GetUpper());
			acc1 = Sse.Add(wide0.GetUpper(), wide2.GetLower());
			acc2 = Sse.Add(wide1.GetLower(), wide2.GetUpper());
		}

		// Remaining units are consumed three at a time; this terminates only when the count is a
		// multiple of 3 (assumed to be guaranteed by the caller — TODO confirm).
		while (remaining != 0)
		{
			var s0 = Sse.LoadVector128(src);
			var s1 = Sse.LoadVector128(src + VectorSse.Count);
			var s2 = Sse.LoadVector128(src + VectorSse.Count * 2);
			src += VectorSse.Count * 3;

			acc0 = HWIntrinsics.MultiplyAdd(acc0, s0, weights);
			acc1 = HWIntrinsics.MultiplyAdd(acc1, s1, weights + VectorSse.Count);
			acc2 = HWIntrinsics.MultiplyAdd(acc2, s2, weights + VectorSse.Count * 2);
			weights += VectorSse.Count * 3;

			remaining -= 3;
		}

		// Permute each accumulator so the lanes still to be added land in lanes 0..2:
		//   perm0 = [a3, a1, a2, a0], perm1 = [b2, b3, b1, b0], perm2 = [c1, c2, c3, c0]
		var perm0 = Sse.Shuffle(acc0, acc0, 0b_00_10_01_11);
		var perm1 = Sse.Shuffle(acc1, acc1, 0b_00_01_11_10);
		var perm2 = Sse.Shuffle(acc2, acc2, 0b_00_11_10_01);
		var permSum = Sse.Add(Sse.Add(perm0, perm1), perm2);

		// firsts = [a0, b0, c0, c1]; adding permSum completes lanes 0..2, while lane 3 is a by-product of the shuffles.
		var firsts = Sse.MoveLowToHigh(Sse.UnpackLow(acc0, acc1), acc2);
		var result = Sse.Add(firsts, permSum);

		Sse.Store(dst, result);
	}
}