// Multithreaded variant of the FMA float sum of squares.
// Intrinsics FMA MultiplyAdd float
/// <summary>
/// Returns the sum of the squares of all bytes in <paramref name="vs"/>, computed in
/// parallel with one partition per processor (roughly) and a Vector256&lt;float&gt; FMA accumulator.
/// </summary>
/// <param name="vs">Input bytes. An empty array yields 0.</param>
/// <returns>Sum of <c>vs[i] * vs[i]</c>.</returns>
/// <remarks>
/// Each partition accumulates in single precision. A float represents integers exactly
/// only up to 2^24, so a partition holding more than about 2064 elements may round.
/// </remarks>
private unsafe long Test13_Intrinsics_FMA_MultiplyAdd_float_MT(byte[] vs)
{
    // Partitioner.Create rejects an empty range, so handle "nothing to sum" up front.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector256<int>.Count;

    // Size of one partition. Clamped to 1 because the integer division yields 0 when the
    // array is shorter than the processor count, and Partitioner.Create throws on that.
    int rangeSize = Math.Max(1, vs.Length / Environment.ProcessorCount);

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        long subtotal = 0;

        // Last index (exclusive) of this partition that can be processed 8 elements at a time.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
        Vector256<float> ff = Vector256.Create(0f);
        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                Vector256<int> v = Avx2.ConvertToVector256Int32(p + i);
                Vector256<float> f = Avx.ConvertToVector256Single(v);
                ff = Fma.MultiplyAdd(f, f, ff); // ff += f * f (float)
            }
        }

        // Horizontal sum of the eight accumulator lanes.
        float* pp = stackalloc float[Vector256<float>.Count];
        Avx.Store(pp, ff);
        for (int i = 0; i < Vector256<float>.Count; i++)
        {
            subtotal += (long)pp[i];
        }

        // Scalar tail for the elements that did not fill a whole vector.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });
    return total;
}
/// <summary>
/// Field scenario: feeds the instance field <c>_fld</c> directly to
/// <c>Avx.ConvertToVector256Single</c>, writes the converted vector to the output
/// buffer of <c>_dataTable</c>, and validates it against the same field.
/// </summary>
public void RunFldScenario()
{
    var converted = Avx.ConvertToVector256Single(_fld);

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(_fld, _dataTable.outArrayPtr);
}
/// <summary>
/// For every index i, writes the Euclidean distance between the points
/// (x[i], y[i], z[i]) and (xx[i], yy[i], zz[i]) into <paramref name="result"/>[i].
/// Work is split across threads; each partition is processed 8 elements at a time
/// with AVX, followed by a scalar tail.
/// </summary>
/// <param name="x">First point, X components. Its length determines how many elements are processed.</param>
/// <param name="y">First point, Y components.</param>
/// <param name="z">First point, Z components.</param>
/// <param name="xx">Second point, X components.</param>
/// <param name="yy">Second point, Y components.</param>
/// <param name="zz">Second point, Z components.</param>
/// <param name="result">Receives the distances.</param>
/// <remarks>
/// All arrays are accessed through raw pointers in the SIMD loop, so every array must
/// be at least as long as <paramref name="x"/>.
/// </remarks>
private unsafe void Test3_Vector256Float(byte[] x, byte[] y, byte[] z, byte[] xx, byte[] yy, byte[] zz, float[] result)
{
    // Partitioner.Create throws when the range is empty.
    if (x.Length == 0)
    {
        return;
    }

    Parallel.ForEach(Partitioner.Create(0, x.Length), range =>
    {
        int simdLength = Vector256<float>.Count;

        // End (exclusive) of the part of this partition that fills whole vectors.
        // The vector loop must stop here: going on to range.Item2 would read and write
        // up to 7 elements past the partition, and past the arrays in the last partition.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
        Vector256<float> vx, vy, vz, vm;
        fixed (byte* px = x, py = y, pz = z, pxx = xx, pyy = yy, pzz = zz)
        {
            fixed (float* dp = result)
            {
                for (int i = range.Item1; i < lastIndex; i += simdLength)
                {
                    vx = Avx.Subtract(
                        Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(px + i)),
                        Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pxx + i)));
                    vy = Avx.Subtract(
                        Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(py + i)),
                        Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pyy + i)));
                    vz = Avx.Subtract(
                        Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pz + i)),
                        Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pzz + i)));
                    vm = Avx.Add(Avx.Multiply(vx, vx), Avx.Multiply(vy, vy));
                    vm = Avx.Sqrt(Avx.Add(vm, Avx.Multiply(vz, vz)));
                    Avx.Store(dp + i, vm);
                }
            }
        }

        // Scalar tail: the elements of this partition that did not fill a whole vector.
        // The squared sum is an integer no larger than 3 * 255^2, which float holds exactly.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            float dx = x[i] - xx[i];
            float dy = y[i] - yy[i];
            float dz = z[i] - zz[i];
            result[i] = System.MathF.Sqrt((dx * dx) + (dy * dy) + (dz * dz));
        }
    });
}
// The largest element count that can be summed without rounding error is 2064.
// That figure is for squaring a byte array with a Vector256<float> accumulator:
// the largest integer a float holds exactly is 16777215 (24 bits) and the largest
// byte is 255, so 16777215 / 255 / 255 = 258.01176.
// Truncated, that is 258 elements per lane; times the 8 lanes of the vector gives
// 258 * 8 = 2064, which is the limit.
// As a bonus, the at most 7 leftover elements that do not fill a vector can be added:
// 2064 + 7 = 2071.
// FMA MultiplyAdd also works on Vector256<double>. That raises the element limit,
// but the lane count halves and it gets slower, so splitting the array and staying
// in float looks more efficient.
// Intrinsics FMA MultiplyAdd float
private unsafe long Test3_Intrinsics_FMA_MultiplyAdd_float(byte[] vs)
{
    int lanes = Vector256<int>.Count;
    int vectorEnd = vs.Length - (vs.Length % lanes);
    Vector256<float> accumulator = Vector256.Create(0f);

    fixed (byte* source = vs)
    {
        int offset = 0;
        while (offset < vectorEnd)
        {
            // Widen 8 bytes to int, then to float, and fold their squares into the accumulator.
            Vector256<float> widened =
                Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(source + offset));
            accumulator = Fma.MultiplyAdd(widened, widened, accumulator); // float
            offset += lanes;
        }
    }

    // Spill the accumulator lanes and add them up as integers.
    int laneCount = Vector256<float>.Count;
    float* spilled = stackalloc float[laneCount];
    Avx.Store(spilled, accumulator);

    long sum = 0;
    for (int lane = 0; lane < laneCount; lane++)
    {
        sum += (long)spilled[lane];
    }

    // Leftover elements that did not fill a whole vector.
    for (int rest = vectorEnd; rest < vs.Length; rest++)
    {
        sum += vs[rest] * vs[rest];
    }

    return sum;
}
// Variant that declares the vector locals outside the loop.
/// <summary>
/// For each group of 8 elements, widens the red/green/blue bytes to float, forms three
/// differences, and stores the square root of the sum of their squares into
/// <paramref name="vv"/>. Elements past the last full vector are delegated to <c>Amari</c>.
/// </summary>
/// <param name="red">Red channel bytes; its length determines how many elements are processed.</param>
/// <param name="green">Green channel bytes.</param>
/// <param name="blue">Blue channel bytes.</param>
/// <param name="vv">Receives one float per element.</param>
private unsafe void Test46_Intrinsics_V256float_Sqrt(byte[] red, byte[] green, byte[] blue, float[] vv)
{
    int lanes = Vector256<float>.Count;
    int vectorEnd = red.Length - (red.Length % lanes);
    float* scratch = stackalloc float[lanes];

    // Deliberately declared outside the loop (this is what this variant measures).
    Vector256<float> r;
    Vector256<float> g;
    Vector256<float> b;
    var magnitude = Vector256<float>.Zero;

    fixed (byte* pR = red, pG = green, pB = blue)
    {
        int offset = 0;
        while (offset < vectorEnd)
        {
            r = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pR + offset));
            g = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pG + offset));
            b = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pB + offset));

            // NOTE(review): r and g are overwritten before they are reused, so the third
            // term is ((g - r) - b), not (r - b) — confirm this matches Amari's scalar formula.
            r = Avx.Subtract(g, r);
            g = Avx.Subtract(b, g);
            b = Avx.Subtract(r, b);

            magnitude = Avx.Add(Avx.Multiply(r, r), Avx.Multiply(g, g));
            magnitude = Avx.Add(magnitude, Avx.Multiply(b, b));
            magnitude = Avx.Sqrt(magnitude);

            // Copy through a stack buffer so writes to vv stay bounds-checked.
            Avx.Store(scratch, magnitude);
            int lane = 0;
            while (lane < lanes)
            {
                vv[offset + lane] = scratch[lane];
                lane++;
            }

            offset += lanes;
        }
    }

    Amari(vectorEnd, red.Length, red, green, blue, vv);
}
/// <summary>
/// Local-variable scenario: reads the operand from the input buffer with
/// <c>Unsafe.Read</c> into a local, converts it with
/// <c>Avx.ConvertToVector256Single</c>, writes the result to the output buffer, and
/// validates it against the local operand.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    var operand = Unsafe.Read<Vector256<Int32>>(_dataTable.inArrayPtr);
    var converted = Avx.ConvertToVector256Single(operand);

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(operand, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-variable scenario: loads the operand from the input buffer with
/// <c>Avx.LoadAlignedVector256</c> into a local, converts it with
/// <c>Avx.ConvertToVector256Single</c>, writes the result to the output buffer, and
/// validates it against the local operand.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    var operand = Avx.LoadAlignedVector256((Int32*)(_dataTable.inArrayPtr));
    var converted = Avx.ConvertToVector256Single(operand);

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(operand, _dataTable.outArrayPtr);
}
/// <summary>
/// Local-field scenario: creates a fresh test instance, converts its <c>_fld</c>
/// field with <c>Avx.ConvertToVector256Single</c>, writes the result to this
/// instance's output buffer, and validates it against the new instance's field.
/// </summary>
public void RunLclFldScenario()
{
    var localTest = new SimpleUnaryOpTest__ConvertToVector256SingleInt32();
    var converted = Avx.ConvertToVector256Single(localTest._fld);

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(localTest._fld, _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario: the operand is read with <c>Unsafe.Read</c> and passed straight to
/// <c>Avx.ConvertToVector256Single</c> as a nested call (no intermediate local). The
/// result is written to the output buffer and validated against the input buffer.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The read stays nested in the call; only the result is held in a local.
    var converted = Avx.ConvertToVector256Single(Unsafe.Read<Vector256<Int32>>(_dataTable.inArrayPtr));

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Basic scenario: the operand is loaded with <c>Avx.LoadAlignedVector256</c> and
/// passed straight to <c>Avx.ConvertToVector256Single</c> as a nested call (no
/// intermediate local). The result is written to the output buffer and validated
/// against the input buffer.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // The load stays nested in the call; only the result is held in a local.
    var converted = Avx.ConvertToVector256Single(Avx.LoadAlignedVector256((Int32*)(_dataTable.inArrayPtr)));

    Unsafe.Write(_dataTable.outArrayPtr, converted);

    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Scratch method that runs a series of load / widen / multiply intrinsics on the start
/// of <paramref name="vs"/>. None of the results are returned or stored.
/// </summary>
/// <param name="vs">
/// Input bytes. The second 32-byte load starts at offset 32, so the array needs at
/// least 64 elements.
/// </param>
private unsafe void TestAddSum(byte[] vs)
{
    fixed (byte* p = vs)
    {
        // Two raw 32-byte loads.
        Vector256<byte> firstBlock = Avx.LoadVector256(p);
        Vector256<byte> secondBlock = Avx.LoadVector256(p + 32);
        //Avx.MultipleSumAbsoluteDifferences;

        // byte -> int -> float, then square with a plain multiply.
        var asInt = Avx2.ConvertToVector256Int32(p);
        var asFloat = Avx.ConvertToVector256Single(asInt);
        var squaredFloat = Avx.Multiply(asFloat, asFloat);

        // byte -> int (128-bit) -> double, squared via FMA with a zero addend.
        var asInt128 = Sse41.ConvertToVector128Int32(p);
        var asDouble = Avx.ConvertToVector256Double(asInt128);
        Vector256<double> zeroDouble = Vector256<double>.Zero;
        var fmaDouble = Fma.MultiplyAdd(asDouble, asDouble, zeroDouble);

        // byte -> int -> float, squared via FMA with a zero addend.
        Vector256<int> asInt256 = Avx2.ConvertToVector256Int32(p);
        Vector256<float> asFloat256 = Avx.ConvertToVector256Single(asInt256);
        Vector256<float> zeroFloat = Vector256<float>.Zero;
        Vector256<float> fmaFloat = Fma.MultiplyAdd(asFloat256, asFloat256, zeroFloat);

        // 128-bit float conversion and a scalar multiply.
        var asFloat128 = Sse2.ConvertToVector128Single(asInt128);
        var scalarProduct = Sse.MultiplyScalar(asFloat128, asFloat128);

        // x86 / x64 SIMD instruction reference table (SSE through AVX2)
        //https://www.officedaytime.com/tips/simd.html
        // pmaddwd
        //https://www.officedaytime.com/tips/simdimg/si.php?f=pmaddwd
        var asShort128 = Sse41.ConvertToVector128Int16(p);
        var pairSums = Avx.MultiplyAddAdjacent(asShort128, asShort128);

        int placeholder = 0;

        // Other multiply-style intrinsics noted for later experiments:
        //Avx.MultiplyAddAdjacent;
        //Avx.MultiplyHigh;
        //Avx.MultiplyHighRoundScale;
        //Avx.MultiplyLow;
        //Avx.MultiplyScalar;
        //Fma.MultiplyAdd;
        //Fma.MultiplyAddNegated;
        //Fma.MultiplyAddNegatedScalar;
        //Fma.MultiplyAddScalar;
        //Fma.MultiplyAddSubtract;
        //Fma.MultiplySubtract;
        //Fma.MultiplySubtractAdd;
        //Fma.MultiplySubtractNegated;
        //Fma.MultiplySubtractNegatedScalar;
        //Fma.MultiplySubtractScalar;
    }
}
// Modified multithreaded variant.
// Splits the array into partitions small enough that the Vector256<float> accumulator
// never loses precision.
// Intrinsics FMA MultiplyAdd float
/// <summary>
/// Returns the sum of the squares of all bytes in <paramref name="vs"/>. The array is
/// processed in parallel in partitions of at most 2064 elements, each accumulated with a
/// Vector256&lt;float&gt; FMA accumulator and then added to a 64-bit total.
/// </summary>
/// <param name="vs">Input bytes. An empty array yields 0.</param>
/// <returns>Sum of <c>vs[i] * vs[i]</c>.</returns>
private unsafe long Test23_Intrinsics_FMA_MultiplyAdd_float_MT_Kai(byte[] vs)
{
    // Partitioner.Create throws on an empty range, so an empty input is answered directly.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector256<int>.Count;

    // Largest element count the float accumulator can take without rounding = 2064.
    // This becomes the number of elements per partition (the split size):
    // float mantissa 24 bits (16777215) * 8 / (255 * 255) = 2064.0941
    int rangeSize = ((1 << 24) - 1) * Vector256<float>.Count / (byte.MaxValue * byte.MaxValue);

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        long subtotal = 0;

        // End (exclusive) of the part of this partition that fills whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
        Vector256<float> vTotal = Vector256.Create(0f); // per-partition accumulator
        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                Vector256<int> v = Avx2.ConvertToVector256Int32(p + i);
                Vector256<float> f = Avx.ConvertToVector256Single(v);
                vTotal = Fma.MultiplyAdd(f, f, vTotal); // vTotal += f * f (float)
            }
        }

        // Horizontal sum of the eight accumulator lanes.
        float* pp = stackalloc float[Vector256<float>.Count];
        Avx.Store(pp, vTotal);
        for (int i = 0; i < Vector256<float>.Count; i++)
        {
            subtotal += (long)pp[i];
        }

        // Scalar tail for the elements that did not fill a whole vector.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });
    return total;
}
// Multiply and add in float.
// With this approach rounding error already shows up at around 100,000 elements.
/// <summary>
/// Computes the variance of <paramref name="vs"/> as mean(x^2) - mean(x)^2. The sum of
/// squares is accumulated 8 elements at a time in a Vector256&lt;float&gt;, followed by a
/// scalar tail.
/// </summary>
/// <param name="vs">Input bytes.</param>
/// <returns>The variance; NaN results from the 0 / 0 division when the array is empty.</returns>
/// <remarks>
/// The float accumulator is intentionally kept (see the note above), so the result is
/// approximate for large inputs.
/// NOTE(review): Test2 is assumed to return the sum of the elements — confirm.
/// </remarks>
private unsafe double Test6Variance(byte[] vs)
{
    int simdLength = Vector256<int>.Count;

    // End (exclusive) of the part that fills whole vectors. The vector loop must stop
    // here, otherwise the last iteration reads up to 7 bytes past the end of the array.
    int lastIndex = vs.Length - (vs.Length % simdLength);
    int i;
    var vTotal = Vector256<float>.Zero;
    fixed (byte* p = vs)
    {
        for (i = 0; i < lastIndex; i += simdLength)
        {
            Vector256<int> v = Avx2.ConvertToVector256Int32(p + i);
            Vector256<float> inu = Avx.ConvertToVector256Single(v);
            Vector256<float> vv = Avx.Multiply(inu, inu);
            vTotal = Avx.Add(vTotal, vv);
        }
    }

    // Horizontal sum of the accumulator lanes.
    double total = 0;
    simdLength = Vector256<float>.Count;
    float* temp = stackalloc float[simdLength];
    Avx.Store(temp, vTotal);
    for (int j = 0; j < simdLength; j++)
    {
        total += temp[j];
    }

    // Scalar tail: squares of the leftover elements (i == lastIndex here).
    for (; i < vs.Length; i++)
    {
        total += vs[i] * vs[i];
    }

    double average = (double)Test2(vs) / vs.Length;
    return (total / vs.Length) - (average * average);
}
/// <summary>
/// Multithreaded version of the red/green/blue difference-magnitude computation. Each
/// partition is processed 8 elements at a time with AVX; the leftover elements of each
/// partition are delegated to <c>Amari</c>.
/// </summary>
/// <param name="red">Red channel bytes; its length determines how many elements are processed.</param>
/// <param name="green">Green channel bytes.</param>
/// <param name="blue">Blue channel bytes.</param>
/// <param name="vv">Receives one float per element.</param>
private unsafe void Test47_Intrinsics_V256float_Sqrt_MT(byte[] red, byte[] green, byte[] blue, float[] vv)
{
    // Partitioner.Create throws on an empty range; there is nothing to compute anyway.
    if (red.Length == 0)
    {
        return;
    }

    // Clamped to 1: the division yields 0 when the array is shorter than the processor
    // count, and Partitioner.Create rejects a range size of 0.
    int rangeSize = Math.Max(1, red.Length / Environment.ProcessorCount);
    int simdLength = Vector256<float>.Count;
    Parallel.ForEach(Partitioner.Create(0, red.Length, rangeSize), (range) =>
    {
        float* tp = stackalloc float[simdLength];

        // End (exclusive) of the part of this partition that fills whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
        var vm = Vector256<float>.Zero;
        Vector256<float> vr;
        Vector256<float> vg;
        Vector256<float> vb;
        fixed (byte* pR = red, pG = green, pB = blue)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                vr = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pR + i));
                vg = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pG + i));
                vb = Avx.ConvertToVector256Single(Avx2.ConvertToVector256Int32(pB + i));

                // NOTE(review): vr and vg are overwritten before they are reused, so the
                // third term is ((g - r) - b) — confirm this matches Amari's scalar formula.
                vr = Avx.Subtract(vg, vr);
                vg = Avx.Subtract(vb, vg);
                vb = Avx.Subtract(vr, vb);
                vm = Avx.Add(Avx.Multiply(vr, vr), Avx.Multiply(vg, vg));
                vm = Avx.Add(vm, Avx.Multiply(vb, vb));
                vm = Avx.Sqrt(vm);

                // Copy through a stack buffer so writes to vv stay bounds-checked.
                Avx.Store(tp, vm);
                for (int m = 0; m < simdLength; m++)
                {
                    vv[i + m] = tp[m];
                }
            }

            // Leftover elements of this partition.
            Amari(lastIndex, range.Item2, red, green, blue, vv);
        }
    });
}
// Convert
/// <summary>
/// Converts <paramref name="a"/> with <c>Avx.ConvertToVector256Single</c> and returns the result.
/// </summary>
public static f32 Converti32_f32(i32 a) => Avx.ConvertToVector256Single(a);
// Converts one line of packed 3-byte pixels to floats. Every 3 input bytes advance the
// output by 4 floats; the scalar path writes only the first 3 floats of each 4-float slot.
//   ipstart: start of the input bytes.
//   opstart: start of the output buffer, written as float.
//   cb:      number of input bytes.
// NOTE(review): LookupTables.Alpha is presumably a byte -> value/255 table matching the
// 1/255 scale used by the SIMD paths — confirm.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb) {
    fixed(float *atstart = &LookupTables.Alpha[0]) {
        byte * ip = ipstart, ipe = ipstart + cb;
        float *op = (float *)opstart, at = atstart;
#if HWINTRINSICS
        if (Avx2.IsSupported) {
            // Each iteration consumes 24 input bytes (four 8-byte loads at 6-byte strides)
            // and writes 32 floats.
            var vscale = Vector256.Create(1f / byte.MaxValue);
            // NOTE(review): the mask contents are defined elsewhere; presumably it spreads
            // two 3-byte pixels into two 4-lane slots — confirm.
            var vmaskp = Avx.LoadVector256((int *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMask3To3xChan)));
            ipe -= Vector256 <byte> .Count * 3 / 4 + 2; // +2 accounts for the overrun on the last read
            while (ip <= ipe) {
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3 / 4);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 6 / 4);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 9 / 4);
                ip += Vector256 <byte> .Count * 3 / 4;
                vi0 = Avx2.PermuteVar8x32(vi0, vmaskp);
                vi1 = Avx2.PermuteVar8x32(vi1, vmaskp);
                vi2 = Avx2.PermuteVar8x32(vi2, vmaskp);
                vi3 = Avx2.PermuteVar8x32(vi3, vmaskp);
                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);
                vf0 = Avx.Multiply(vf0, vscale);
                vf1 = Avx.Multiply(vf1, vscale);
                vf2 = Avx.Multiply(vf2, vscale);
                vf3 = Avx.Multiply(vf3, vscale);
                Avx.Store(op, vf0);
                Avx.Store(op + Vector256 <float> .Count, vf1);
                Avx.Store(op + Vector256 <float> .Count * 2, vf2);
                Avx.Store(op + Vector256 <float> .Count * 3, vf3);
                op += Vector256 <byte> .Count;
            }
            // Restore the true end pointer for the scalar tail.
            ipe += Vector256 <byte> .Count * 3 / 4 + 2;
        } else if (Sse41.IsSupported) {
            // Each iteration consumes 12 input bytes (four 4-byte loads at 3-byte strides)
            // and writes 16 floats. There is no permute here, so the 4th lane of each
            // vector holds the byte that follows the pixel.
            var vscale = Vector128.Create(1f / byte.MaxValue);
            ipe -= Vector128 <byte> .Count * 3 / 4 + 1; // +1 accounts for the overrun on the last read
            while (ip <= ipe) {
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3 / 4);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 6 / 4);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 9 / 4);
                ip += Vector128 <byte> .Count * 3 / 4;
                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);
                vf0 = Sse.Multiply(vf0, vscale);
                vf1 = Sse.Multiply(vf1, vscale);
                vf2 = Sse.Multiply(vf2, vscale);
                vf3 = Sse.Multiply(vf3, vscale);
                Sse.Store(op, vf0);
                Sse.Store(op + Vector128 <float> .Count, vf1);
                Sse.Store(op + Vector128 <float> .Count * 2, vf2);
                Sse.Store(op + Vector128 <float> .Count * 3, vf3);
                op += Vector128 <byte> .Count;
            }
            // Restore the true end pointer for the scalar tail.
            ipe += Vector128 <byte> .Count * 3 / 4 + 1;
        }
#endif
        // Scalar tail (and the only path without HWINTRINSICS): one pixel per iteration
        // via table lookup. All three reads happen before the writes.
        while (ip < ipe) {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            ip += 3;
            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op += 4;
        }
    }
}
// Converts one line of bytes to floats, one float per input byte.
// The vector paths compute (value - offset) * scale; the scalar paths look the value up
// in valueTable.
//   ipstart: start of the input bytes.
//   opstart: start of the output buffer, written as float.
//   cb:      number of input bytes.
// NOTE(review): valueTable presumably holds the same (value - offset) * scale mapping
// precomputed for every byte value — confirm.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb) {
    fixed(float *atstart = &valueTable[0]) {
        byte * ip = ipstart, ipe = ipstart + cb;
        float *op = (float *)opstart, at = atstart;
#if HWINTRINSICS
        if (Avx2.IsSupported) {
            // 32 bytes in, 32 floats out per iteration.
            var vscal = Vector256.Create(scale);
            // With FMA the offset is pre-multiplied by scale so that a single fused
            // multiply-subtract (v * scale - offset * scale) can be used below.
            var voffs = Fma.IsSupported ? Vector256.Create(offset * scale) : Vector256.Create(offset);
            ipe -= Vector256 <byte> .Count;
            while (ip <= ipe) {
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 2);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3);
                ip += Vector256 <byte> .Count;
                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);
                if (Fma.IsSupported) {
                    vf0 = Fma.MultiplySubtract(vf0, vscal, voffs);
                    vf1 = Fma.MultiplySubtract(vf1, vscal, voffs);
                    vf2 = Fma.MultiplySubtract(vf2, vscal, voffs);
                    vf3 = Fma.MultiplySubtract(vf3, vscal, voffs);
                } else {
                    vf0 = Avx.Multiply(Avx.Subtract(vf0, voffs), vscal);
                    vf1 = Avx.Multiply(Avx.Subtract(vf1, voffs), vscal);
                    vf2 = Avx.Multiply(Avx.Subtract(vf2, voffs), vscal);
                    vf3 = Avx.Multiply(Avx.Subtract(vf3, voffs), vscal);
                }
                Avx.Store(op, vf0);
                Avx.Store(op + Vector256 <int> .Count, vf1);
                Avx.Store(op + Vector256 <int> .Count * 2, vf2);
                Avx.Store(op + Vector256 <int> .Count * 3, vf3);
                op += Vector256 <byte> .Count;
            }
            // Restore the true end pointer for the scalar loops.
            ipe += Vector256 <byte> .Count;
        } else if (Sse41.IsSupported) {
            // 16 bytes in, 16 floats out per iteration.
            var vscal = Vector128.Create(scale);
            var voffs = Vector128.Create(offset);
            ipe -= Vector128 <byte> .Count;
            while (ip <= ipe) {
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 2);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3);
                ip += Vector128 <byte> .Count;
                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);
                vf0 = Sse.Multiply(Sse.Subtract(vf0, voffs), vscal);
                vf1 = Sse.Multiply(Sse.Subtract(vf1, voffs), vscal);
                vf2 = Sse.Multiply(Sse.Subtract(vf2, voffs), vscal);
                vf3 = Sse.Multiply(Sse.Subtract(vf3, voffs), vscal);
                Sse.Store(op, vf0);
                Sse.Store(op + Vector128 <int> .Count, vf1);
                Sse.Store(op + Vector128 <int> .Count * 2, vf2);
                Sse.Store(op + Vector128 <int> .Count * 3, vf3);
                op += Vector128 <byte> .Count;
            }
            // Restore the true end pointer for the scalar loops.
            ipe += Vector128 <byte> .Count;
        }
#elif VECTOR_CONVERT
        // Portable Vector<T> path: one Vector<byte> in, four float vectors out per iteration.
        var vscal = new VectorF(scale);
        var voffs = new VectorF(offset);
        ipe -= Vector <byte> .Count;
        while (ip <= ipe) {
            var vb = Unsafe.ReadUnaligned <Vector <byte> >(ip);
            // Widen twice, then reinterpret as Int32 for the float conversion.
            Vector.Widen(vb, out var vs0, out var vs1);
            Vector.Widen(vs0, out var vi0, out var vi1);
            Vector.Widen(vs1, out var vi2, out var vi3);
            ip += Vector <byte> .Count;
            var vf0 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi0));
            var vf1 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi1));
            var vf2 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi2));
            var vf3 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi3));
            vf0 = (vf0 - voffs) * vscal;
            vf1 = (vf1 - voffs) * vscal;
            vf2 = (vf2 - voffs) * vscal;
            vf3 = (vf3 - voffs) * vscal;
            Unsafe.WriteUnaligned(op, vf0);
            Unsafe.WriteUnaligned(op + VectorF.Count, vf1);
            Unsafe.WriteUnaligned(op + VectorF.Count * 2, vf2);
            Unsafe.WriteUnaligned(op + VectorF.Count * 3, vf3);
            op += Vector <byte> .Count;
        }
        // Restore the true end pointer for the scalar loops.
        ipe += Vector <byte> .Count;
#endif
        // Scalar loop unrolled by 8: table lookups for whatever the vector path left over.
        ipe -= 8;
        while (ip <= ipe) {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            float o3 = at[(uint)ip[3]];
            float o4 = at[(uint)ip[4]];
            float o5 = at[(uint)ip[5]];
            float o6 = at[(uint)ip[6]];
            float o7 = at[(uint)ip[7]];
            ip += 8;
            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op[3] = o3;
            op[4] = o4;
            op[5] = o5;
            op[6] = o6;
            op[7] = o7;
            op += 8;
        }
        ipe += 8;
        // Final 0-7 bytes, one at a time.
        while (ip < ipe) {
            op[0] = at[(uint)ip[0]];
            ip++;
            op++;
        }
    }
}
// Maps a line of floats through a lookup table with linear interpolation, in place.
// Each value is multiplied by lutmax, clamped to [0, lutmax], split into an integer
// index i and a fraction, and replaced by Lerp(lut[i], lut[i + 1], fraction).
//   ipstart / opstart: the buffer; the Debug.Assert requires both to be the same pointer.
//   lutstart: the lookup table. Index i can reach lutmax and lut[i + 1] is also read,
//             so the table must be readable through index lutmax + 1.
//   lutmax:   scale factor and upper clamp for the table index.
//   cb:       buffer length in bytes.
unsafe public static void ConvertFloat(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb) {
    Debug.Assert(ipstart == opstart);
    float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb);
    float *lp = lutstart;
#if HWINTRINSICS
    if (Avx2.IsSupported) {
        var vlmax = Vector256.Create((float)lutmax);
        var vzero = Vector256 <float> .Zero;
        var vione = Vector256.Create(1);
        ipe -= Vector256 <float> .Count;
        // 8 floats per iteration.
        while (ip <= ipe) {
            var vf = Avx.Multiply(vlmax, Avx.LoadVector256(ip));
            vf = Avx.Min(Avx.Max(vzero, vf), vlmax);
            // Truncated index and its float value; their difference from vf is the fraction.
            var vi = Avx.ConvertToVector256Int32WithTruncation(vf);
            var vp = Avx.ConvertToVector256Single(vi);
            // Gather lut[i] and lut[i + 1] for all 8 lanes.
            var vl = Avx2.GatherVector256(lp, vi, sizeof(float));
            var vh = Avx2.GatherVector256(lp, Avx2.Add(vi, vione), sizeof(float));
            vf = HWIntrinsics.Lerp(vl, vh, Avx.Subtract(vf, vp));
            Avx.Store(ip, vf);
            ip += Vector256 <float> .Count;
        }
        // Restore the true end pointer for the scalar tail.
        ipe += Vector256 <float> .Count;
        float fmin = vzero.ToScalar(), flmax = vlmax.ToScalar();
        // Scalar tail: remaining 0-7 floats.
        while (ip < ipe) {
            float f = (*ip * flmax).Clamp(fmin, flmax);
            uint i = (uint)f;
            *ip++ = Lerp(lp[i], lp[i + 1], f - i);
        }
    } else
#endif
    {
        // Fallback path: 4 floats per iteration using Vector4 for the scale and clamp.
        var vlmax = new Vector4(lutmax);
        var vzero = Vector4.Zero;
        ipe -= 4;
        while (ip <= ipe) {
            var vf = (Unsafe.ReadUnaligned <Vector4>(ip) * vlmax).Clamp(vzero, vlmax);
            float f0 = vf.X;
            float f1 = vf.Y;
            float f2 = vf.Z;
            float f3 = vf.W;
            uint i0 = (uint)f0;
            uint i1 = (uint)f1;
            uint i2 = (uint)f2;
            uint i3 = (uint)f3;
            ip[0] = Lerp(lp[i0], lp[i0 + 1], f0 - (int)i0);
            ip[1] = Lerp(lp[i1], lp[i1 + 1], f1 - (int)i1);
            ip[2] = Lerp(lp[i2], lp[i2 + 1], f2 - (int)i2);
            ip[3] = Lerp(lp[i3], lp[i3 + 1], f3 - (int)i3);
            ip += 4;
        }
        // Restore the true end pointer for the scalar tail.
        ipe += 4;
        float fmin = vzero.X, flmax = vlmax.X;
        // Scalar tail: remaining 0-3 floats.
        while (ip < ipe) {
            float f = (*ip * flmax).Clamp(fmin, flmax);
            uint i = (uint)f;
            *ip++ = Lerp(lp[i], lp[i + 1], f - i);
        }
    }
}
// Maps a line of 4-float pixels through a lookup table with linear interpolation, in
// place. The first three floats of each pixel are divided by the fourth (W), scaled by
// lutmax, clamped, interpolated through the table, and multiplied by W again. The scalar
// path leaves W itself unchanged.
//   ipstart / opstart: the buffer; the Debug.Assert requires both to be the same pointer.
//   lutstart: the lookup table; lut[i + 1] is read for i up to lutmax.
//   lutmax:   scale factor and upper clamp for the table index.
//   cb:       buffer length in bytes.
// NOTE(review): the AVX2 path uses the approximate Avx.Reciprocal and has no small-W
// cutoff, while the scalar path divides exactly and zeroes pixels with W < 1/1024 —
// confirm the difference between the two paths is intended.
unsafe public static void ConvertFloat3A(byte *ipstart, byte *opstart, float *lutstart, int lutmax, int cb) {
    Debug.Assert(ipstart == opstart);
    float *ip = (float *)ipstart, ipe = (float *)(ipstart + cb);
    float *lp = lutstart;
#if HWINTRINSICS
    if (Avx2.IsSupported) {
        // NOTE(review): GatherMask3x is defined elsewhere; presumably it enables the
        // gather for the first three lanes of each pixel only — confirm.
        var vgmsk = Avx.BroadcastVector128ToVector256((float *)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.GatherMask3x)));
        var vgmax = Vector256.Create((float)lutmax);
        var vzero = Vector256 <float> .Zero;
        var vfone = Vector256.Create(1f);
        var vione = Vector256.Create(1);
        ipe -= Vector256 <float> .Count;
        // 8 floats (two pixels) per iteration.
        while (ip <= ipe) {
            var vf = Avx.Max(vzero, Avx.LoadVector256(ip));
            // NOTE(review): ShuffleMaskAlpha presumably broadcasts each pixel's W lane — confirm.
            var va = Avx.Shuffle(vf, vf, HWIntrinsics.ShuffleMaskAlpha);
            vf = Avx.Multiply(vf, Avx.Multiply(vgmax, Avx.Reciprocal(va)));
            vf = Avx.Min(vf, vgmax);
            var vi = Avx.ConvertToVector256Int32WithTruncation(vf);
            var vfi = Avx.ConvertToVector256Single(vi);
            // Masked gathers: lanes not selected by vgmsk take their value from vfone (1.0).
            var vl = Avx2.GatherMaskVector256(vfone, lp, vi, vgmsk, sizeof(float));
            var vh = Avx2.GatherMaskVector256(vfone, lp, Avx2.Add(vi, vione), vgmsk, sizeof(float));
            vf = HWIntrinsics.Lerp(vl, vh, Avx.Subtract(vf, vfi));
            vf = Avx.Multiply(vf, va);
            Avx.Store(ip, vf);
            ip += Vector256 <float> .Count;
        }
        // Restore the true end pointer for the scalar loop below.
        ipe += Vector256 <float> .Count;
    }
#endif
    {
        // Scalar path, one pixel per iteration. It also finishes what the AVX2 loop left over.
        var vlmax = new Vector4(lutmax);
        var vzero = Vector4.Zero;
        float famin = new Vector4(1 / 1024f).X;
        while (ip < ipe) {
            var vf = Unsafe.ReadUnaligned <Vector4>(ip);
            float f3 = vf.W;
            if (f3 < famin) {
                // W below the 1/1024 cutoff: the whole pixel is written as zero.
                Unsafe.WriteUnaligned(ip, vzero);
            } else {
                vf = (vf * vlmax / f3).Clamp(vzero, vlmax);
                float f0 = vf.X;
                float f1 = vf.Y;
                float f2 = vf.Z;
                uint i0 = (uint)f0;
                uint i1 = (uint)f1;
                uint i2 = (uint)f2;
                ip[0] = Lerp(lp[i0], lp[i0 + 1], f0 - (int)i0) * f3;
                ip[1] = Lerp(lp[i1], lp[i1 + 1], f1 - (int)i1) * f3;
                ip[2] = Lerp(lp[i2], lp[i2 + 1], f2 - (int)i2) * f3;
            }
            ip += 4;
        }
    }
}
// Converts one line of 4-byte pixels to 4-float pixels and multiplies the first three
// channels by the fourth. In the scalar path: op[0..2] = o0..o2 * o3 and op[3] = o3.
//   ipstart: start of the input bytes.
//   opstart: start of the output buffer, written as float.
//   cb:      number of input bytes.
// NOTE(review): LookupTables.Alpha is presumably a byte -> value/255 table matching the
// 1/255 scale used by the SIMD paths — confirm.
// NOTE(review): ShuffleMaskAlpha / BlendMaskAlpha are defined elsewhere; presumably they
// broadcast and re-insert the 4th lane of each pixel — confirm.
unsafe void IConversionProcessor.ConvertLine(byte *ipstart, byte *opstart, int cb) {
    fixed(float *atstart = &LookupTables.Alpha[0]) {
        byte * ip = ipstart, ipe = ipstart + cb;
        float *op = (float *)opstart, at = atstart;
#if HWINTRINSICS
        if (Avx2.IsSupported) {
            // 32 bytes in, 32 floats out per iteration.
            var vscale = Vector256.Create(1f / byte.MaxValue);
            ipe -= Vector256 <byte> .Count;
            while (ip <= ipe) {
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 2);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256 <int> .Count * 3);
                ip += Vector256 <byte> .Count;
                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);
                vf0 = Avx.Multiply(vf0, vscale);
                vf1 = Avx.Multiply(vf1, vscale);
                vf2 = Avx.Multiply(vf2, vscale);
                vf3 = Avx.Multiply(vf3, vscale);
                // Shuffle out the multiplier, multiply every lane by it, then blend the
                // shuffled value back into the lanes selected by BlendMaskAlpha.
                var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);
                vf0 = Avx.Multiply(vf0, vfa0);
                vf1 = Avx.Multiply(vf1, vfa1);
                vf2 = Avx.Multiply(vf2, vfa2);
                vf3 = Avx.Multiply(vf3, vfa3);
                vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);
                Avx.Store(op, vf0);
                Avx.Store(op + Vector256 <int> .Count, vf1);
                Avx.Store(op + Vector256 <int> .Count * 2, vf2);
                Avx.Store(op + Vector256 <int> .Count * 3, vf3);
                op += Vector256 <byte> .Count;
            }
            // Restore the true end pointer for the scalar tail.
            ipe += Vector256 <byte> .Count;
        } else if (Sse41.IsSupported) {
            // 16 bytes in, 16 floats out per iteration; same steps with 128-bit vectors.
            var vscale = Vector128.Create(1f / byte.MaxValue);
            ipe -= Vector128 <byte> .Count;
            while (ip <= ipe) {
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 2);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128 <int> .Count * 3);
                ip += Vector128 <byte> .Count;
                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);
                vf0 = Sse.Multiply(vf0, vscale);
                vf1 = Sse.Multiply(vf1, vscale);
                vf2 = Sse.Multiply(vf2, vscale);
                vf3 = Sse.Multiply(vf3, vscale);
                var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);
                vf0 = Sse.Multiply(vf0, vfa0);
                vf1 = Sse.Multiply(vf1, vfa1);
                vf2 = Sse.Multiply(vf2, vfa2);
                vf3 = Sse.Multiply(vf3, vfa3);
                vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);
                Sse.Store(op, vf0);
                Sse.Store(op + Vector128 <int> .Count, vf1);
                Sse.Store(op + Vector128 <int> .Count * 2, vf2);
                Sse.Store(op + Vector128 <int> .Count * 3, vf3);
                op += Vector128 <byte> .Count;
            }
            // Restore the true end pointer for the scalar tail.
            ipe += Vector128 <byte> .Count;
        }
#endif
        // Scalar tail (and the only path without HWINTRINSICS): one pixel per iteration
        // via table lookup. All four reads happen before the writes.
        while (ip < ipe) {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            float o3 = at[(uint)ip[3]];
            ip += 4;
            op[0] = o0 * o3;
            op[1] = o1 * o3;
            op[2] = o2 * o3;
            op[3] = o3;
            op += 4;
        }
    }
}