// Multithreaded version of the SSE4.1 DotProduct sum of squares.
// Returns the sum of vs[i] * vs[i] over the whole array; each partition keeps its own
// subtotal and merges it into the shared total with an atomic add.
private unsafe long Test17_Intrinsics_SSE41_DotProduct_float_MT(byte[] vs)
{
    // Partitioner.Create rejects an empty range, so an empty array is answered directly.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<int>.Count;

    // For arrays shorter than the processor count the division yields 0, which
    // Partitioner.Create rejects; clamp the range size to at least one element.
    int rangeSize = Math.Max(1, vs.Length / Environment.ProcessorCount);

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), (range) =>
    {
        long subtotal = 0;
        // End of the part of this range that can be processed four bytes at a time.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                Vector128<float> vv = Sse2.ConvertToVector128Single(v);
                // Multiply all four lanes (upper four control bits set) and place the
                // summed result in lane 0 (lowest control bit set).
                Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                subtotal += (long)dp.GetElement(0);
            }
        }

        // Scalar tail of the range.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });

    return total;
}
// For every index covered by whole 4-element vectors, writes
// sqrt((g - r)^2 + (b - g)^2 + (r - b)^2) into vv, computed in single precision.
// The leftover index range [vectorEnd, red.Length) is passed on to Amari
// (presumably the scalar remainder handler; it is defined outside this block).
private unsafe void Test44_Intrinsics_V128float_Sqrt(byte[] red, byte[] green, byte[] blue, float[] vv)
{
    int step = Vector128<float>.Count;
    int vectorEnd = red.Length - (red.Length % step);
    // Scratch buffer used to move one vector of results into the managed array.
    float* scratch = stackalloc float[step];

    fixed (byte* pR = red, pG = green, pB = blue)
    {
        int i = 0;
        while (i < vectorEnd)
        {
            // Widen four bytes of each channel to float once, then reuse them.
            Vector128<float> r = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pR + i));
            Vector128<float> g = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pG + i));
            Vector128<float> b = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(pB + i));

            Vector128<float> diffGR = Sse.Subtract(g, r);
            Vector128<float> diffBG = Sse.Subtract(b, g);
            Vector128<float> diffRB = Sse.Subtract(r, b);

            Vector128<float> sumOfSquares = Sse.Add(Sse.Multiply(diffGR, diffGR), Sse.Multiply(diffBG, diffBG));
            sumOfSquares = Sse.Add(sumOfSquares, Sse.Multiply(diffRB, diffRB));

            Sse.Store(scratch, Sse.Sqrt(sumOfSquares));
            for (int lane = 0; lane < step; lane++)
            {
                vv[i + lane] = scratch[lane];
            }

            i += step;
        }
    }

    Amari(vectorEnd, red.Length, red, green, blue, vv);
}
// Sum of squares using FMA on doubles: four bytes are widened to a Vector256<double>
// and accumulated with acc = value * value + acc. Returns the sum of vs[i] * vs[i].
private unsafe long Test4_Intrinsics_FMA_MultiplyAdd_double(byte[] vs)
{
    int step = Vector128<int>.Count;
    int vectorEnd = vs.Length - (vs.Length % step);
    Vector256<double> accumulator = Vector256<double>.Zero;

    fixed (byte* p = vs)
    {
        int i = 0;
        while (i < vectorEnd)
        {
            Vector256<double> widened = Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(p + i));
            accumulator = Fma.MultiplyAdd(widened, widened, accumulator);
            i += step;
        }
    }

    // Fold the four double lanes into the integer total.
    long total = 0;
    for (int lane = 0; lane < Vector256<double>.Count; lane++)
    {
        total += (long)accumulator.GetElement(lane);
    }

    // Scalar tail for the elements that did not fill a whole vector.
    for (int i = vectorEnd; i < vs.Length; i++)
    {
        total += vs[i] * vs[i];
    }

    return total;
}
// Writes, for every index i of x, the Euclidean distance
// sqrt((x - xx)^2 + (y - yy)^2 + (z - zz)^2) into result[i], in parallel.
// Whole groups of four elements use AVX; the rest of each partition is computed
// with scalar code so nothing is read or written outside the partition.
private unsafe void Test2_Vector256Double(byte[] x, byte[] y, byte[] z, byte[] xx, byte[] yy, byte[] zz, double[] result)
{
    // Partitioner.Create rejects an empty range.
    if (x.Length == 0)
    {
        return;
    }

    Parallel.ForEach(Partitioner.Create(0, x.Length), range =>
    {
        int simdLength = Vector256<double>.Count;
        // End of the part of this range that can be processed in whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
        Vector256<double> vx, vy, vz, vm;

        fixed (byte* px = x, py = y, pz = z, pxx = xx, pyy = yy, pzz = zz)
        {
            fixed (double* dp = result)
            {
                // Stop at lastIndex: a vector step starting later would touch elements
                // owned by the next partition or lying beyond the arrays.
                for (int i = range.Item1; i < lastIndex; i += simdLength)
                {
                    // Differences
                    vx = Avx.Subtract(
                        Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(px + i)),
                        Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pxx + i)));
                    vy = Avx.Subtract(
                        Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(py + i)),
                        Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pyy + i)));
                    vz = Avx.Subtract(
                        Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pz + i)),
                        Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pzz + i)));

                    // Square root of the sum of squares
                    vm = Avx.Add(Avx.Multiply(vx, vx), Avx.Multiply(vy, vy));
                    vm = Avx.Sqrt(Avx.Add(vm, Avx.Multiply(vz, vz)));

                    // Write the results into the array
                    Avx.Store(dp + i, vm);
                }
            }
        }

        // Scalar tail of the partition (same operation order as the vector path).
        for (int i = lastIndex; i < range.Item2; i++)
        {
            double dx = x[i] - xx[i];
            double dy = y[i] - yy[i];
            double dz = z[i] - zz[i];
            result[i] = Math.Sqrt((dx * dx + dy * dy) + dz * dz);
        }
    });
}
// Scenario: the operand comes from this instance's _fld member.
public void RunFldScenario()
{
    // Convert the field operand to a Vector128 of Int32.
    var converted = Sse41.ConvertToVector128Int32(_fld);

    // Publish the result to the output buffer, then check it against the operand.
    Unsafe.Write(_dataTable.outArrayPtr, converted);
    ValidateResult(_fld, _dataTable.outArrayPtr);
}
// Same computation as the four-lane DotProduct version, but the array is split into
// partitions small enough that the float accumulator never exceeds the range in which
// integers are represented exactly. Returns the sum of vs[i] * vs[i].
private unsafe long Test28_Intrinsics_SSE41_DotProduct_float_MT_Kai(byte[] vs)
{
    // Partitioner.Create rejects an empty range, so an empty array is answered directly.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<int>.Count * 4;

    // Maximum number of elements one Vector128<float> accumulator can take = 1032:
    // (24-bit float significand) / (byte max * byte max) elements per lane, times 4 lanes.
    // 16777215 / (255 * 255) * 4 = 1032.0471, truncated, is the partition size.
    int rangeSize = ((1 << 24) - 1) / (byte.MaxValue * byte.MaxValue) * Vector128<float>.Count; // 1032

    Parallel.ForEach(
        Partitioner.Create(0, vs.Length, rangeSize),
        (range) =>
        {
            var vTotal = Vector128<float>.Zero;
            int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

            fixed (byte* p = vs)
            {
                for (int i = range.Item1; i < lastIndex; i += simdLength)
                {
                    Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                    var vv = Sse2.ConvertToVector128Single(v);
                    // Multiply all four lanes (upper four control bits set) and place the
                    // summed result in lane 0 (lowest control bit set).
                    Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                    vTotal = Sse.Add(vTotal, dp);

                    v = Sse41.ConvertToVector128Int32(p + i + 4);
                    vv = Sse2.ConvertToVector128Single(v);
                    dp = Sse41.DotProduct(vv, vv, 0b11110010); // result goes to lane 1
                    vTotal = Sse.Add(vTotal, dp);

                    v = Sse41.ConvertToVector128Int32(p + i + 8);
                    vv = Sse2.ConvertToVector128Single(v);
                    dp = Sse41.DotProduct(vv, vv, 0b11110100); // result goes to lane 2
                    vTotal = Sse.Add(vTotal, dp);

                    v = Sse41.ConvertToVector128Int32(p + i + 12);
                    vv = Sse2.ConvertToVector128Single(v);
                    dp = Sse41.DotProduct(vv, vv, 0b11111000); // result goes to lane 3
                    vTotal = Sse.Add(vTotal, dp);
                }
            }

            // Fold the four float lanes into this partition's integer subtotal.
            long subtotal = 0;
            float* f = stackalloc float[Vector128<float>.Count];
            Sse.Store(f, vTotal);
            for (int i = 0; i < Vector128<float>.Count; i++)
            {
                subtotal += (long)f[i];
            }

            // Scalar tail of the partition.
            for (int i = lastIndex; i < range.Item2; i++)
            {
                subtotal += vs[i] * vs[i];
            }

            System.Threading.Interlocked.Add(ref total, subtotal);
        });

    return total;
}
// Scenario: the operand is first loaded into a local with an aligned load from the
// input buffer (viewed as Int16*), then converted.
public void RunLclVarScenario_LoadAligned()
{
    var operand = Sse2.LoadAlignedVector128((Int16*)(_dataTable.inArrayPtr));
    var converted = Sse41.ConvertToVector128Int32(operand);

    // Publish the result to the output buffer, then check it against the operand.
    Unsafe.Write(_dataTable.outArrayPtr, converted);
    ValidateResult(operand, _dataTable.outArrayPtr);
}
// Scenario: the operand is first read into a local with Unsafe.Read from the input
// buffer, then converted.
public void RunLclVarScenario_UnsafeRead()
{
    var operand = Unsafe.Read<Vector128<Int16>>(_dataTable.inArrayPtr);
    var converted = Sse41.ConvertToVector128Int32(operand);

    // Publish the result to the output buffer, then check it against the operand.
    Unsafe.Write(_dataTable.outArrayPtr, converted);
    ValidateResult(operand, _dataTable.outArrayPtr);
}
// Scenario: the operand is the _fld member of a freshly constructed local test object.
public void RunLclFldScenario()
{
    var localTest = new SimpleUnaryOpTest__ConvertToVector128Int32Int16();
    var converted = Sse41.ConvertToVector128Int32(localTest._fld);

    // Publish the result to the output buffer, then check it against the operand.
    Unsafe.Write(_dataTable.outArrayPtr, converted);
    ValidateResult(localTest._fld, _dataTable.outArrayPtr);
}
// Scenario: the operand is read with Unsafe.Read from the input buffer and passed
// straight to the conversion.
public void RunBasicScenario_UnsafeRead()
{
    var converted = Sse41.ConvertToVector128Int32(Unsafe.Read<Vector128<Int16>>(_dataTable.inArrayPtr));

    // Publish the result, then validate it against the raw input buffer.
    Unsafe.Write(_dataTable.outArrayPtr, converted);
    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
// Scenario: the conversion is given a pointer into the input buffer (viewed as SByte*)
// instead of a vector value.
// NOTE(review): the other scenarios visible in this chunk view the input as Int16;
// confirm this method belongs to the SByte variant of the test.
public void RunBasicScenario_Ptr()
{
    var converted = Sse41.ConvertToVector128Int32((SByte*)(_dataTable.inArrayPtr));

    // Publish the result, then validate it against the raw input buffer.
    Unsafe.Write(_dataTable.outArrayPtr, converted);
    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
// Scenario: the operand is produced by an aligned load from the input buffer
// (viewed as Int16*) and passed straight to the conversion.
public void RunBasicScenario_LoadAligned()
{
    var converted = Sse41.ConvertToVector128Int32(Sse2.LoadAlignedVector128((Int16*)(_dataTable.inArrayPtr)));

    // Publish the result, then validate it against the raw input buffer.
    Unsafe.Write(_dataTable.outArrayPtr, converted);
    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
// Computes a hash of len bytes starting at buf: each byte is multiplied by a factor
// taken from the kMultFactors table (indexed from the end of the input), the products
// are summed, and the sum is masked with (kBase - 1).
// Blocks of four bytes are processed with SSE; the remaining bytes use scalar code.
private unsafe ulong HashSse(byte* buf, int len)
{
    ulong h = 0;
    Vector128<int> v_ps = Vector128<int>.Zero;
    bool useSse4 = Sse41.IsSupported;
    int i = 0;

    for (int j = len - i - 1; len - i >= 4; i += 4, j = len - i - 1)
    {
        // Four factors ending at index j, reordered with the SO123 shuffle control
        // (presumably a lane reversal so factors line up with the bytes; the constant
        // is defined outside this block).
        Vector128<int> c_v = Sse2.LoadVector128(&kMultFactorsPtr[j - 3]);
        c_v = Sse2.Shuffle(c_v, SO123);

        // Load exactly the four bytes this iteration consumes. A full 16-byte load
        // would read up to 12 bytes past buf + len on the final blocks.
        Vector128<byte> q_v = Sse2.LoadScalarVector128((uint*)(buf + i)).AsByte();

        if (useSse4)
        {
            // Zero-extend the four bytes to 32-bit lanes and multiply lane-wise.
            Vector128<int> s_v = Sse41.ConvertToVector128Int32(q_v);
            v_ps = Sse2.Add(v_ps, Sse41.MultiplyLow(c_v, s_v));
        }
        else
        {
            // SSE2 fallback: duplicate each byte across a 32-bit lane, then shift the
            // lane right by 24 so only the byte value remains.
            q_v = Sse2.UnpackLow(q_v, q_v);
            Vector128<int> s_v = Sse2.ShiftRightLogical(Sse2.UnpackLow(q_v.AsUInt16(), q_v.AsUInt16()).AsInt32(), 24);

            // Emulate the 32-bit lane-wise multiply with two 32x32->64 multiplies
            // (even lanes, then odd lanes shifted down by 4 bytes) and recombine the
            // selected 32-bit parts of the products.
            Vector128<ulong> v_tmp1 = Sse2.Multiply(c_v.AsUInt32(), s_v.AsUInt32());
            Vector128<ulong> v_tmp2 = Sse2.Multiply(Sse2.ShiftRightLogical128BitLane(c_v.AsByte(), 4).AsUInt32(), Sse2.ShiftRightLogical128BitLane(s_v.AsByte(), 4).AsUInt32());
            v_ps = Sse2.Add(v_ps, Sse2.UnpackLow(Sse2.Shuffle(v_tmp1.AsInt32(), SOO2O), Sse2.Shuffle(v_tmp2.AsInt32(), SOO2O)));
        }
    }

    // Combine the four partial sums and take lane 0 of the result (the shuffle
    // controls S23O1 / S1O32 are defined outside this block).
    v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S23O1));
    v_ps = Sse2.Add(v_ps, Sse2.Shuffle(v_ps, S1O32));
    h += Sse2.ConvertToUInt32(v_ps.AsUInt32());

    // Scalar tail: fewer than four bytes remain.
    for (; i < len; i++)
    {
        int index = len - i - 1;
        ulong c = (uint)kMultFactors[index];
        h += c * buf[i];
    }

    return h & (kBase - 1);
}
// Builds the RgbaColor32 counterpart of this color.
// With SSE4.1 the struct's storage is reinterpreted as one uint and its low four bytes
// are converted to four 32-bit lanes; otherwise the four components are passed one by one.
public RgbaColor32 GetColor32()
{
    if (!Sse41.IsSupported)
    {
        return new RgbaColor32(R, G, B, A);
    }

    uint packed = Unsafe.As<RgbaColor8, uint>(ref this);
    Vector128<byte> packedBytes = Vector128.CreateScalarUnsafe(packed).AsByte();

    return new RgbaColor32(Sse41.ConvertToVector128Int32(packedBytes));
}
// About 4x faster (per the original timing note): builds each vector with the
// conversion intrinsics. Takes the square root of every byte of vs, four at a time,
// as doubles; the results are discarded. Elements beyond the last whole group of
// four are not processed.
private unsafe void Test6(byte[] vs)
{
    int simdLength = Vector256<double>.Count;
    int lastIndex = vs.Length - (vs.Length % simdLength);

    fixed (byte* p = vs)
    {
        for (int i = 0; i < lastIndex; i += simdLength)
        {
            // Advance through the array: converting from p alone would process the
            // same first four bytes on every iteration.
            _ = Avx.Sqrt(Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(p + i)));
        }
    }
}
// Scratch method that exercises several widening / multiply / multiply-add intrinsics
// on the start of vs. None of the computed values is used or returned.
// It loads 256-bit vectors at offsets 0 and 32, so vs needs at least 64 bytes.
private unsafe void TestAddSum(byte[] vs)
{
    fixed (byte* p = vs)
    {
        // Raw 256-bit loads of the first and second 32 bytes.
        var firstBlock = Avx.LoadVector256(p);
        var secondBlock = Avx.LoadVector256(p + 32);

        // 8 bytes -> int -> float, squared with a plain multiply.
        Vector256<int> ints8 = Avx2.ConvertToVector256Int32(p);
        Vector256<float> floats8 = Avx.ConvertToVector256Single(ints8);
        Vector256<float> squared8 = Avx.Multiply(floats8, floats8);

        // 4 bytes -> int -> double, squared with FMA against a zero addend.
        Vector128<int> ints4 = Sse41.ConvertToVector128Int32(p);
        Vector256<double> doubles4 = Avx.ConvertToVector256Double(ints4);
        var zeroDoubles = Vector256<double>.Zero;
        Vector256<double> fmaDoubles = Fma.MultiplyAdd(doubles4, doubles4, zeroDoubles);

        // 8 bytes -> int -> float, squared with FMA against a zero addend.
        var ints8Again = Avx2.ConvertToVector256Int32(p);
        var floats8Again = Avx.ConvertToVector256Single(ints8Again);
        var zeroFloats = Vector256<float>.Zero;
        var fmaFloats = Fma.MultiplyAdd(floats8Again, floats8Again, zeroFloats);

        // 4 bytes -> float; scalar multiply affects only the lowest lane.
        Vector128<float> floats4 = Sse2.ConvertToVector128Single(ints4);
        Vector128<float> scalarSquare = Sse.MultiplyScalar(floats4, floats4);

        // x86/x64 SIMD instruction reference (SSE - AVX2)
        // https://www.officedaytime.com/tips/simd.html
        // pmaddwd
        // https://www.officedaytime.com/tips/simdimg/si.php?f=pmaddwd
        Vector128<short> shorts8 = Sse41.ConvertToVector128Int16(p);
        Vector128<int> pairSums = Avx.MultiplyAddAdjacent(shorts8, shorts8);

        var placeholder = 0;

        // Other candidates noted for later experiments:
        // Avx.MultiplyAddAdjacent, Avx.MultiplyHigh, Avx.MultiplyHighRoundScale,
        // Avx.MultiplyLow, Avx.MultiplyScalar,
        // Fma.MultiplyAdd, Fma.MultiplyAddNegated, Fma.MultiplyAddNegatedScalar,
        // Fma.MultiplyAddScalar, Fma.MultiplyAddSubtract, Fma.MultiplySubtract,
        // Fma.MultiplySubtractAdd, Fma.MultiplySubtractNegated,
        // Fma.MultiplySubtractNegatedScalar, Fma.MultiplySubtractScalar
    }
}
// Updates one vector-wide section of the first diagonal buffer from the two diagonal
// buffers and the source/target characters, for T = int (4 lanes) or T = ushort (8 lanes).
// Per lane: cost = min(insert, delete, substitution) + 1, where the substitution cost is
// the diag1 value plus the compare mask (-1 / all ones where the characters are equal, 0
// otherwise). Any other T leaves the buffers untouched.
// rowIndex is only read here although it is passed by reference.
public static unsafe void CalculateDiagonalSection_Sse41<T>(void* refDiag1Ptr, void* refDiag2Ptr, char* sourcePtr, char* targetPtr, ref int rowIndex, int columnIndex) where T : struct
{
    if (typeof(T) == typeof(int))
    {
        var diag1 = (int*)refDiag1Ptr;
        var diag2 = (int*)refDiag2Ptr;

        // Source characters widened to int lanes; target characters widened and then
        // lane-reversed (shuffle control 0x1b selects lanes 3,2,1,0).
        var source = Sse41.ConvertToVector128Int32((ushort*)sourcePtr + rowIndex - Vector128<T>.Count);
        var target = Sse2.Shuffle(Sse41.ConvertToVector128Int32((ushort*)targetPtr + columnIndex - 1), 0x1b);

        var equalMask = Sse2.CompareEqual(source, target);
        var substitution = Sse2.Add(Sse3.LoadDquVector128(diag1 + rowIndex - Vector128<T>.Count), equalMask);
        var deletion = Sse3.LoadDquVector128(diag2 + rowIndex - (Vector128<T>.Count - 1));
        var insertion = Sse3.LoadDquVector128(diag2 + rowIndex - Vector128<T>.Count);

        var cheapest = Sse41.Min(Sse41.Min(insertion, deletion), substitution);
        var updated = Sse2.Add(cheapest, Vector128.Create(1));

        Sse2.Store(diag1 + rowIndex - (Vector128<T>.Count - 1), updated);
        return;
    }

    if (typeof(T) == typeof(ushort))
    {
        var diag1 = (ushort*)refDiag1Ptr;
        var diag2 = (ushort*)refDiag2Ptr;

        // Characters are used as 16-bit lanes directly; the target vector is reordered
        // with the REVERSE_USHORT_AS_BYTE_128 byte-shuffle mask (defined outside this block).
        var source = Sse3.LoadDquVector128((ushort*)sourcePtr + rowIndex - Vector128<T>.Count);
        var target = Sse3.LoadDquVector128((ushort*)targetPtr + columnIndex - 1);
        target = Ssse3.Shuffle(target.AsByte(), REVERSE_USHORT_AS_BYTE_128).AsUInt16();

        var equalMask = Sse2.CompareEqual(source, target);
        var substitution = Sse2.Add(Sse3.LoadDquVector128(diag1 + rowIndex - Vector128<T>.Count), equalMask);
        var deletion = Sse3.LoadDquVector128(diag2 + rowIndex - (Vector128<T>.Count - 1));
        var insertion = Sse3.LoadDquVector128(diag2 + rowIndex - Vector128<T>.Count);

        var cheapest = Sse41.Min(Sse41.Min(insertion, deletion), substitution);
        var updated = Sse2.Add(cheapest, Vector128.Create((ushort)1));

        Sse2.Store(diag1 + rowIndex - (Vector128<T>.Count - 1), updated);
    }
}
// About 12x faster (per the original timing note): vector Sqrt, run in parallel.
// Takes the square root of the first ELEMENT_COUNT bytes of vs, four at a time, as
// doubles; the results are discarded. Elements of a partition beyond its last whole
// group of four are not processed.
private unsafe void Test6_MT(byte[] vs)
{
    // Never walk past the end of the array, even if ELEMENT_COUNT is larger than it.
    int count = Math.Min(ELEMENT_COUNT, vs.Length);

    // Partitioner.Create rejects an empty range.
    if (count <= 0)
    {
        return;
    }

    Parallel.ForEach(Partitioner.Create(0, count), range =>
    {
        int simdLength = Vector256<double>.Count;
        // End of the part of this range that can be processed in whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Advance through the array: converting from p alone would process the
                // same first four bytes on every iteration.
                _ = Avx.Sqrt(Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(p + i)));
            }
        }
    });
}
// SSE4.1 DotProduct sum of squares, handling four groups of four bytes per loop
// iteration; each group's dot product lands in its own lane of a float accumulator.
// Returns the sum of vs[i] * vs[i].
// The float accumulator is folded into the integer total every 64 iterations so that
// no lane ever exceeds 2^24, the largest range in which float holds integers exactly.
private unsafe long Test8_Intrinsics_SSE41_DotProduct_float(byte[] vs)
{
    long total = 0;
    int simdLength = Vector128<int>.Count * 4;
    int lastIndex = vs.Length - (vs.Length % simdLength);

    // One iteration adds at most 4 * 255 * 255 to a lane:
    // 16777215 / 260100 = 64 iterations can be accumulated without rounding.
    int iterationsPerFlush = ((1 << 24) - 1) / (Vector128<int>.Count * byte.MaxValue * byte.MaxValue);
    int flushSpan = iterationsPerFlush * simdLength;

    fixed (byte* p = vs)
    {
        int i = 0;
        while (i < lastIndex)
        {
            // Written as a subtraction so that i + flushSpan cannot overflow.
            int chunkEnd = lastIndex - i > flushSpan ? i + flushSpan : lastIndex;
            var vTotal = Vector128<float>.Zero;

            for (; i < chunkEnd; i += simdLength)
            {
                Vector128<int> v = Sse41.ConvertToVector128Int32(p + i);
                var vv = Sse2.ConvertToVector128Single(v);
                // Multiply all four lanes (upper four control bits set) and place the
                // summed result in lane 0 (lowest control bit set).
                Vector128<float> dp = Sse41.DotProduct(vv, vv, 0b11110001);
                vTotal = Sse.Add(vTotal, dp);

                v = Sse41.ConvertToVector128Int32(p + i + 4);
                vv = Sse2.ConvertToVector128Single(v);
                dp = Sse41.DotProduct(vv, vv, 0b11110010); // result goes to lane 1
                vTotal = Sse.Add(vTotal, dp);

                v = Sse41.ConvertToVector128Int32(p + i + 8);
                vv = Sse2.ConvertToVector128Single(v);
                dp = Sse41.DotProduct(vv, vv, 0b11110100); // result goes to lane 2
                vTotal = Sse.Add(vTotal, dp);

                v = Sse41.ConvertToVector128Int32(p + i + 12);
                vv = Sse2.ConvertToVector128Single(v);
                dp = Sse41.DotProduct(vv, vv, 0b11111000); // result goes to lane 3
                vTotal = Sse.Add(vTotal, dp);
            }

            // Fold the (still exact) lane sums into the integer total.
            for (int lane = 0; lane < Vector128<float>.Count; lane++)
            {
                total += (long)vTotal.GetElement(lane);
            }
        }
    }

    // Scalar tail for the elements that did not fill a whole 16-byte step.
    for (int i = lastIndex; i < vs.Length; i++)
    {
        total += vs[i] * vs[i];
    }

    return total;
}
// x86/x64 SIMD instruction reference (SSE - AVX2)
// https://www.officedaytime.com/tips/simd.html
// Arithmetic: dot product, DPPS
// Sum of squares using the SSE4.1 DotProduct intrinsic, four bytes per step.
// Returns the sum of vs[i] * vs[i].
private unsafe long Test7_Intrinsics_SSE41_DotProduct_float(byte[] vs)
{
    int step = Vector128<int>.Count;
    int vectorEnd = vs.Length - (vs.Length % step);
    long sum = 0;

    fixed (byte* p = vs)
    {
        int i = 0;
        while (i < vectorEnd)
        {
            Vector128<float> values = Sse2.ConvertToVector128Single(Sse41.ConvertToVector128Int32(p + i));
            // Control byte: multiply all four lanes (upper four bits) and put the
            // summed result into lane 0 (lowest bit).
            Vector128<float> dot = Sse41.DotProduct(values, values, 0b11110001);
            sum += (long)dot.GetElement(0);
            i += step;
        }
    }

    // Scalar tail for the elements that did not fill a whole vector.
    for (int i = vectorEnd; i < vs.Length; i++)
    {
        sum += vs[i] * vs[i];
    }

    return sum;
}
// Reads four signed bytes at address and sign-extends them to four 32-bit lanes.
public static unsafe Vector128<int> xmm__1(sbyte* address)
    => Sse41.ConvertToVector128Int32(address);
// Converts one line of input that uses 3 bytes per pixel into float output that uses a
// stride of 4 floats per pixel.
//   ipstart - start of the input bytes
//   opstart - start of the output buffer, written as floats
//   cb      - number of input bytes to convert
// The vector paths scale each byte by 1/255; the scalar path looks each byte up in
// LookupTables.Alpha (presumably the same 0..255 -> 0..1 mapping - confirm against the table).
// The fourth float of each output pixel carries no defined value: the scalar path leaves
// it unwritten and the vector paths store whatever byte followed the pixel, scaled.
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    fixed (float* atstart = &LookupTables.Alpha[0])
    {
        byte* ip = ipstart, ipe = ipstart + cb;
        float* op = (float*)opstart, at = atstart;

#if HWINTRINSICS
        if (Avx2.IsSupported)
        {
            var vscale = Vector256.Create(1f / byte.MaxValue);
            // Permute control loaded from HWIntrinsics.PermuteMask3To3xChan (presumably
            // spreads two packed 3-byte pixels onto two 4-lane halves - defined elsewhere).
            var vmaskp = Avx.LoadVector256((int*)Unsafe.AsPointer(ref MemoryMarshal.GetReference(HWIntrinsics.PermuteMask3To3xChan)));

            // Each iteration consumes 24 input bytes (8 pixels) but the last 8-byte read
            // starts at offset 18, so 26 bytes must be available.
            ipe -= Vector256<byte>.Count * 3 / 4 + 2; // +2 accounts for the overrun on the last read
            while (ip <= ipe)
            {
                // Four 8-byte reads, each starting 6 bytes (two pixels) after the previous.
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 3 / 4);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 6 / 4);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 9 / 4);
                ip += Vector256<byte>.Count * 3 / 4;

                vi0 = Avx2.PermuteVar8x32(vi0, vmaskp);
                vi1 = Avx2.PermuteVar8x32(vi1, vmaskp);
                vi2 = Avx2.PermuteVar8x32(vi2, vmaskp);
                vi3 = Avx2.PermuteVar8x32(vi3, vmaskp);

                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);

                vf0 = Avx.Multiply(vf0, vscale);
                vf1 = Avx.Multiply(vf1, vscale);
                vf2 = Avx.Multiply(vf2, vscale);
                vf3 = Avx.Multiply(vf3, vscale);

                Avx.Store(op, vf0);
                Avx.Store(op + Vector256<float>.Count, vf1);
                Avx.Store(op + Vector256<float>.Count * 2, vf2);
                Avx.Store(op + Vector256<float>.Count * 3, vf3);
                // 32 floats written = 8 pixels at 4 floats each.
                op += Vector256<byte>.Count;
            }
            // Restore the true end so the scalar loop finishes the remaining pixels.
            ipe += Vector256<byte>.Count * 3 / 4 + 2;
        }
        else if (Sse41.IsSupported)
        {
            var vscale = Vector128.Create(1f / byte.MaxValue);

            // Each iteration consumes 12 input bytes (4 pixels) but the last 4-byte read
            // starts at offset 9, so 13 bytes must be available.
            ipe -= Vector128<byte>.Count * 3 / 4 + 1; // +1 accounts for the overrun on the last read
            while (ip <= ipe)
            {
                // Four 4-byte reads, each starting 3 bytes (one pixel) after the previous.
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 3 / 4);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 6 / 4);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 9 / 4);
                ip += Vector128<byte>.Count * 3 / 4;

                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);

                vf0 = Sse.Multiply(vf0, vscale);
                vf1 = Sse.Multiply(vf1, vscale);
                vf2 = Sse.Multiply(vf2, vscale);
                vf3 = Sse.Multiply(vf3, vscale);

                Sse.Store(op, vf0);
                Sse.Store(op + Vector128<float>.Count, vf1);
                Sse.Store(op + Vector128<float>.Count * 2, vf2);
                Sse.Store(op + Vector128<float>.Count * 3, vf3);
                // 16 floats written = 4 pixels at 4 floats each.
                op += Vector128<byte>.Count;
            }
            // Restore the true end so the scalar loop finishes the remaining pixels.
            ipe += Vector128<byte>.Count * 3 / 4 + 1;
        }
#endif

        // Scalar path: one pixel (3 bytes in, 4-float stride out) per iteration.
        while (ip < ipe)
        {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            ip += 3;

            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op += 4;
        }
    }
}
// Converts one line of bytes to floats, one float per input byte.
//   ipstart - start of the input bytes
//   opstart - start of the output buffer, written as floats
//   cb      - number of input bytes to convert
// The vector paths compute (value - offset) * scale; the scalar paths look each byte up
// in valueTable (presumably precomputed with the same formula - confirm where the table
// is built).
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    fixed (float* atstart = &valueTable[0])
    {
        byte* ip = ipstart, ipe = ipstart + cb;
        float* op = (float*)opstart, at = atstart;

#if HWINTRINSICS
        if (Avx2.IsSupported)
        {
            var vscal = Vector256.Create(scale);
            // With FMA the offset is pre-multiplied by the scale so that
            // value * scale - offset * scale can be done in one fused operation.
            var voffs = Fma.IsSupported ? Vector256.Create(offset * scale) : Vector256.Create(offset);

            // Pull the end back by one full step (32 bytes) so the loop never reads past it.
            ipe -= Vector256<byte>.Count;
            while (ip <= ipe)
            {
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 2);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 3);
                ip += Vector256<byte>.Count;

                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);

                if (Fma.IsSupported)
                {
                    vf0 = Fma.MultiplySubtract(vf0, vscal, voffs);
                    vf1 = Fma.MultiplySubtract(vf1, vscal, voffs);
                    vf2 = Fma.MultiplySubtract(vf2, vscal, voffs);
                    vf3 = Fma.MultiplySubtract(vf3, vscal, voffs);
                }
                else
                {
                    vf0 = Avx.Multiply(Avx.Subtract(vf0, voffs), vscal);
                    vf1 = Avx.Multiply(Avx.Subtract(vf1, voffs), vscal);
                    vf2 = Avx.Multiply(Avx.Subtract(vf2, voffs), vscal);
                    vf3 = Avx.Multiply(Avx.Subtract(vf3, voffs), vscal);
                }

                Avx.Store(op, vf0);
                Avx.Store(op + Vector256<int>.Count, vf1);
                Avx.Store(op + Vector256<int>.Count * 2, vf2);
                Avx.Store(op + Vector256<int>.Count * 3, vf3);
                op += Vector256<byte>.Count;
            }
            // Restore the true end for the scalar loops below.
            ipe += Vector256<byte>.Count;
        }
        else if (Sse41.IsSupported)
        {
            var vscal = Vector128.Create(scale);
            var voffs = Vector128.Create(offset);

            // Pull the end back by one full step (16 bytes) so the loop never reads past it.
            ipe -= Vector128<byte>.Count;
            while (ip <= ipe)
            {
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 2);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 3);
                ip += Vector128<byte>.Count;

                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);

                vf0 = Sse.Multiply(Sse.Subtract(vf0, voffs), vscal);
                vf1 = Sse.Multiply(Sse.Subtract(vf1, voffs), vscal);
                vf2 = Sse.Multiply(Sse.Subtract(vf2, voffs), vscal);
                vf3 = Sse.Multiply(Sse.Subtract(vf3, voffs), vscal);

                Sse.Store(op, vf0);
                Sse.Store(op + Vector128<int>.Count, vf1);
                Sse.Store(op + Vector128<int>.Count * 2, vf2);
                Sse.Store(op + Vector128<int>.Count * 3, vf3);
                op += Vector128<byte>.Count;
            }
            // Restore the true end for the scalar loops below.
            ipe += Vector128<byte>.Count;
        }
#elif VECTOR_CONVERT
        var vscal = new VectorF(scale);
        var voffs = new VectorF(offset);

        // Pull the end back by one Vector<byte> so the loop never reads past it.
        ipe -= Vector<byte>.Count;
        while (ip <= ipe)
        {
            // Widen bytes twice (byte -> 16-bit -> 32-bit) to get four 32-bit vectors.
            var vb = Unsafe.ReadUnaligned<Vector<byte>>(ip);
            Vector.Widen(vb, out var vs0, out var vs1);
            Vector.Widen(vs0, out var vi0, out var vi1);
            Vector.Widen(vs1, out var vi2, out var vi3);
            ip += Vector<byte>.Count;

            var vf0 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi0));
            var vf1 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi1));
            var vf2 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi2));
            var vf3 = Vector.ConvertToSingle(Vector.AsVectorInt32(vi3));

            vf0 = (vf0 - voffs) * vscal;
            vf1 = (vf1 - voffs) * vscal;
            vf2 = (vf2 - voffs) * vscal;
            vf3 = (vf3 - voffs) * vscal;

            Unsafe.WriteUnaligned(op, vf0);
            Unsafe.WriteUnaligned(op + VectorF.Count, vf1);
            Unsafe.WriteUnaligned(op + VectorF.Count * 2, vf2);
            Unsafe.WriteUnaligned(op + VectorF.Count * 3, vf3);
            op += Vector<byte>.Count;
        }
        // Restore the true end for the scalar loops below.
        ipe += Vector<byte>.Count;
#endif

        // Scalar path, unrolled: eight table lookups per iteration.
        ipe -= 8;
        while (ip <= ipe)
        {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            float o3 = at[(uint)ip[3]];
            float o4 = at[(uint)ip[4]];
            float o5 = at[(uint)ip[5]];
            float o6 = at[(uint)ip[6]];
            float o7 = at[(uint)ip[7]];
            ip += 8;

            op[0] = o0;
            op[1] = o1;
            op[2] = o2;
            op[3] = o3;
            op[4] = o4;
            op[5] = o5;
            op[6] = o6;
            op[7] = o7;
            op += 8;
        }
        ipe += 8;

        // Final bytes, one at a time.
        while (ip < ipe)
        {
            op[0] = at[(uint)ip[0]];
            ip++;
            op++;
        }
    }
}
// Resamples inputBuffer into outputBuffer with a 4-tap filter.
// Each output sample is the rounded dot product of four consecutive input samples and
// four coefficients taken from the table returned by GetDefaultParameter(ratio).
//   outputBuffer - receives sampleCount output samples
//   inputBuffer  - source samples; every output sample reads four consecutive entries
//   ratio        - amount added to fraction after every output sample
//   fraction     - running fractional input position; updated in place (except in the
//                  ratio == 1 vector path, which does not touch it)
//   sampleCount  - number of output samples to produce
//   needPitch    - not read anywhere in this method
// With SSE4.1, groups of four output samples are produced per iteration; the final
// scalar loop produces whatever samples remain (all of them without SSE4.1).
private unsafe static void ResampleDefaultQuality(Span<float> outputBuffer, ReadOnlySpan<short> inputBuffer, float ratio, ref float fraction, int sampleCount, bool needPitch)
{
    ReadOnlySpan<float> parameters = GetDefaultParameter(ratio);

    int inputBufferIndex = 0, i = 0;

    // TODO: REV8 fast path (when needPitch == false the input index progression is constant + we need SIMD)

    if (Sse41.IsSupported)
    {
        if (ratio == 1f)
        {
            // Unit ratio: the input index advances by exactly one per output sample and
            // the first four coefficients of the table are used for every sample.
            fixed (short* pInput = inputBuffer)
            {
                fixed (float* pOutput = outputBuffer, pParameters = parameters)
                {
                    Vector128<float> parameter = Sse.LoadVector128(pParameters);

                    for (; i < (sampleCount & ~3); i += 4)
                    {
                        // Four overlapping windows of four input samples each.
                        Vector128<int> intInput0 = Sse41.ConvertToVector128Int32(pInput + (uint)i);
                        Vector128<int> intInput1 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 1);
                        Vector128<int> intInput2 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 2);
                        Vector128<int> intInput3 = Sse41.ConvertToVector128Int32(pInput + (uint)i + 3);

                        Vector128<float> input0 = Sse2.ConvertToVector128Single(intInput0);
                        Vector128<float> input1 = Sse2.ConvertToVector128Single(intInput1);
                        Vector128<float> input2 = Sse2.ConvertToVector128Single(intInput2);
                        Vector128<float> input3 = Sse2.ConvertToVector128Single(intInput3);

                        Vector128<float> mix0 = Sse.Multiply(input0, parameter);
                        Vector128<float> mix1 = Sse.Multiply(input1, parameter);
                        Vector128<float> mix2 = Sse.Multiply(input2, parameter);
                        Vector128<float> mix3 = Sse.Multiply(input3, parameter);

                        // Two rounds of horizontal adds reduce each window to one lane,
                        // giving the four output samples in order.
                        Vector128<float> mix01 = Sse3.HorizontalAdd(mix0, mix1);
                        Vector128<float> mix23 = Sse3.HorizontalAdd(mix2, mix3);

                        Vector128<float> mix0123 = Sse3.HorizontalAdd(mix01, mix23);

                        Sse.Store(pOutput + (uint)i, Sse41.RoundToNearestInteger(mix0123));
                    }
                }
            }

            // The scalar loop continues from the input position reached here.
            inputBufferIndex = i;
        }
        else
        {
            fixed (short* pInput = inputBuffer)
            {
                fixed (float* pOutput = outputBuffer, pParameters = parameters)
                {
                    for (; i < (sampleCount & ~3); i += 4)
                    {
                        // For each of the four samples: coefficient row from the
                        // fractional part (128 steps, 4 coefficients per row) and input
                        // index from the integer part accumulated so far in this group.
                        uint baseIndex0 = (uint)(fraction * 128) * 4;
                        uint inputIndex0 = (uint)inputBufferIndex;

                        fraction += ratio;

                        uint baseIndex1 = ((uint)(fraction * 128) & 127) * 4;
                        uint inputIndex1 = (uint)inputBufferIndex + (uint)fraction;

                        fraction += ratio;

                        uint baseIndex2 = ((uint)(fraction * 128) & 127) * 4;
                        uint inputIndex2 = (uint)inputBufferIndex + (uint)fraction;

                        fraction += ratio;

                        uint baseIndex3 = ((uint)(fraction * 128) & 127) * 4;
                        uint inputIndex3 = (uint)inputBufferIndex + (uint)fraction;

                        fraction += ratio;
                        inputBufferIndex += (int)fraction;

                        // Only keep lower part (safe as fraction isn't supposed to be negative)
                        fraction -= (int)fraction;

                        Vector128<float> parameter0 = Sse.LoadVector128(pParameters + baseIndex0);
                        Vector128<float> parameter1 = Sse.LoadVector128(pParameters + baseIndex1);
                        Vector128<float> parameter2 = Sse.LoadVector128(pParameters + baseIndex2);
                        Vector128<float> parameter3 = Sse.LoadVector128(pParameters + baseIndex3);

                        Vector128<int> intInput0 = Sse41.ConvertToVector128Int32(pInput + inputIndex0);
                        Vector128<int> intInput1 = Sse41.ConvertToVector128Int32(pInput + inputIndex1);
                        Vector128<int> intInput2 = Sse41.ConvertToVector128Int32(pInput + inputIndex2);
                        Vector128<int> intInput3 = Sse41.ConvertToVector128Int32(pInput + inputIndex3);

                        Vector128<float> input0 = Sse2.ConvertToVector128Single(intInput0);
                        Vector128<float> input1 = Sse2.ConvertToVector128Single(intInput1);
                        Vector128<float> input2 = Sse2.ConvertToVector128Single(intInput2);
                        Vector128<float> input3 = Sse2.ConvertToVector128Single(intInput3);

                        Vector128<float> mix0 = Sse.Multiply(input0, parameter0);
                        Vector128<float> mix1 = Sse.Multiply(input1, parameter1);
                        Vector128<float> mix2 = Sse.Multiply(input2, parameter2);
                        Vector128<float> mix3 = Sse.Multiply(input3, parameter3);

                        // Two rounds of horizontal adds reduce each window to one lane.
                        Vector128<float> mix01 = Sse3.HorizontalAdd(mix0, mix1);
                        Vector128<float> mix23 = Sse3.HorizontalAdd(mix2, mix3);

                        Vector128<float> mix0123 = Sse3.HorizontalAdd(mix01, mix23);

                        Sse.Store(pOutput + (uint)i, Sse41.RoundToNearestInteger(mix0123));
                    }
                }
            }
        }
    }

    // Scalar path: remaining samples, one at a time, with bounds-checked span slices.
    for (; i < sampleCount; i++)
    {
        int baseIndex = (int)(fraction * 128) * 4;
        ReadOnlySpan<float> parameter = parameters.Slice(baseIndex, 4);
        ReadOnlySpan<short> currentInput = inputBuffer.Slice(inputBufferIndex, 4);

        outputBuffer[i] = (float)Math.Round(currentInput[0] * parameter[0] +
                                            currentInput[1] * parameter[1] +
                                            currentInput[2] * parameter[2] +
                                            currentInput[3] * parameter[3]);

        fraction += ratio;
        inputBufferIndex += (int)fraction;

        // Only keep lower part (safe as fraction isn't supposed to be negative)
        fraction -= (int)fraction;
    }
}
// Maps cp 4-byte pixels from pimage to one output byte each (written to pout), diffusing
// the per-channel difference between the adjusted pixel and the chosen node's stored
// sums into neighbouring pixels through the perr buffer.
//   pimage   - input pixels, 4 bytes each; byte 3 is compared against alphaThreshold
//   perr     - error buffer, one 4-int vector per pixel; the store for each pixel goes one
//              vector behind the read position, so the slot before perr must be writable
//              (NOTE(review): confirm the caller offsets perr accordingly)
//   pout     - output, one byte per pixel (the low byte of the chosen node's fourth sum)
//   pilut    - lookup table with three 256-entry sections, one per colour channel, whose
//              entries are OR-ed into the packed child-index path used to walk the tree
//   ptree    - octree node array; each node is read as 8 ushort child indices followed
//              by 4 uint sums
//   ppal     - not read anywhere in this method
//   nextFree - index of the next unused node; advanced when nodes are created
//   cp       - number of pixels to process (at least one: the loop is do/while)
unsafe private void remapDitherSse2(byte* pimage, int* perr, byte* pout, uint* pilut, OctreeNode* ptree, uint* ppal, ref nuint nextFree, nint cp)
{
    // Stand-in node used for pixels below the alpha threshold.
    var transnode = new OctreeNode();
    transnode.Sums[3] = byte.MaxValue;

    var vpmax = Vector128.Create((int)byte.MaxValue);
    var vprnd = Vector128.Create(7);
    var vzero = Vector128<int>.Zero;

    nuint level = leafLevel;
    var prnod = default(OctreeNode*);

    byte* ip = pimage, ipe = ip + cp * sizeof(uint);
    byte* op = pout;
    int* ep = perr;

    // vppix: last adjusted pixel that was looked up; vperr / vnerr: running error terms
    // carried from the previous pixels.
    var vppix = vzero;
    var vperr = vzero;
    var vnerr = vzero;

    do
    {
        Vector128<int> vpix, vdiff;
        if ((byte)ip[3] < alphaThreshold)
        {
            // Below the alpha threshold: use the stand-in node and diffuse no error.
            vppix = vzero;
            vdiff = vzero;
            prnod = &transnode;
            goto FoundExact;
        }

        // Widen the four bytes of the pixel to four 32-bit lanes.
        if (Sse41.IsSupported)
        {
            vpix = Sse41.ConvertToVector128Int32(ip);
        }
        else
        {
            vpix = Sse2.UnpackLow(Sse2.UnpackLow(Sse2.LoadScalarVector128((int*)ip).AsByte(), vzero.AsByte()).AsInt16(), vzero.AsInt16()).AsInt32();
        }

        // Error for this pixel: 7 (rounding) + stored error + 7 * previous difference,
        // applied as an arithmetic shift right by 4 (division by 16).
        var verr = Sse2.Add(Sse2.Add(vprnd, Sse2.LoadVector128(ep)), Sse2.Subtract(Sse2.ShiftLeftLogical(vnerr, 3), vnerr));
        vpix = Sse2.Add(vpix, Sse2.ShiftRightArithmetic(verr, 4));
        // Clamp every lane to 0..255.
        vpix = Sse2.Min(vpix.AsInt16(), vpmax.AsInt16()).AsInt32();
        vpix = Sse2.Max(vpix.AsInt16(), vzero.AsInt16()).AsInt32();

        // Same adjusted pixel as the last lookup: reuse the node in prnod with zero difference.
        if (Sse2.MoveMask(Sse2.CompareEqual(vppix, vpix).AsByte()) == ushort.MaxValue)
        {
            vdiff = vzero;
            goto FoundExact;
        }

        vppix = vpix;
        // Build the packed path from the three channel lookups (lanes 0, 1 and 2).
        nuint idx =
            pilut[(nuint)Sse2.ConvertToUInt32(vppix.AsUInt32())] |
            pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 2) + 256] |
            pilut[(nuint)Sse2.Extract(vppix.AsUInt16(), 4) + 512];
        nuint next = idx & 7;

        // Walk down the tree, consuming three path bits per level.
        var pnode = ptree + next;
        for (nuint i = 0; i <= level; i++)
        {
            idx >>= 3;
            nuint child = idx & 7;

            ushort* children = (ushort*)pnode;
            next = children[child];
            if (next == 0)
            {
                uint* sums = (uint*)(children + 8);

                if (i < minLeafLevel)
                {
                    // Above the minimum leaf level: create the missing child node.
                    next = nextFree++;
                    children[child] = (ushort)next;
                    pnode = ptree + next;

                    if (i == minLeafLevel - 1)
                    {
                        // Node created at the minimum leaf level is initialised from the pixel.
                        initNode(pnode, vppix);
                        break;
                    }
                    else
                    {
                        // Intermediate node: its fourth sum is set to byte.MaxValue, the
                        // value the branch below tests for before trying siblings.
                        uint* csums = (uint*)((ushort*)pnode + 8);
                        csums[3] = byte.MaxValue;
                    }
                }
                else if ((byte)sums[3] == byte.MaxValue)
                {
                    // Missing child under a node marked byte.MaxValue: try the siblings,
                    // in order of increasing XOR distance from the wanted child index.
                    for (nuint j = 1; j < 8; j++)
                    {
                        nuint sibling = children[child ^ j];
                        if (sibling != 0)
                        {
                            var snode = ptree + sibling;
                            uint* ssums = (uint*)((ushort*)snode + 8);
                            if ((byte)ssums[3] == byte.MaxValue)
                            {
                                // Sibling carries the same marker: continue the walk through it.
                                next = sibling;
                                nuint mask = child ^ sibling;
                                idx = (child & mask) | (idx & ~mask);
                                break;
                            }
                            else
                            {
                                // Sibling without the marker: use it as the result.
                                prnod = snode;
                                goto Found;
                            }
                        }
                    }
                }
                else
                {
                    // Current node is not marked: stop here and use it.
                    break;
                }
            }

            pnode = ptree + next;
        }

        prnod = pnode;

    Found:
        // Per-channel difference between the adjusted pixel and the chosen node's sums.
        vdiff = Sse2.Subtract(vppix, Sse2.LoadVector128((int*)((ushort*)prnod + 8)));

    FoundExact:
        int* psums = (int*)((ushort*)prnod + 8);
        ip += sizeof(uint);
        *op++ = (byte)psums[3];

        // Store the error for the slot one pixel behind (carried error + 2 * difference),
        // then update the carried terms for the following pixels.
        Sse2.Store(ep - Vector128<int>.Count, Sse2.Add(vperr, Sse2.Add(vdiff, vdiff)));
        ep += Vector128<int>.Count;

        vperr = Sse2.Add(Sse2.ShiftLeftLogical(vdiff, 2), vnerr);
        vnerr = vdiff;
    } while (ip < ipe);

    // Flush the carried error of the final pixel.
    Sse2.Store(ep - Vector128<int>.Count, vperr);
}
// Converts one line of 4-byte pixels to 4-float pixels, multiplying the first three
// channels by the fourth (alpha) and writing the fourth unchanged.
//   ipstart - start of the input bytes (4 per pixel)
//   opstart - start of the output buffer, written as floats (4 per pixel)
//   cb      - number of input bytes to convert
// The vector paths scale each byte by 1/255; the scalar path looks each byte up in
// LookupTables.Alpha (presumably the same 0..255 -> 0..1 mapping - confirm against the table).
unsafe void IConversionProcessor.ConvertLine(byte* ipstart, byte* opstart, int cb)
{
    fixed (float* atstart = &LookupTables.Alpha[0])
    {
        byte* ip = ipstart, ipe = ipstart + cb;
        float* op = (float*)opstart, at = atstart;

#if HWINTRINSICS
        if (Avx2.IsSupported)
        {
            var vscale = Vector256.Create(1f / byte.MaxValue);

            // Pull the end back by one full step (32 bytes) so the loop never reads past it.
            ipe -= Vector256<byte>.Count;
            while (ip <= ipe)
            {
                var vi0 = Avx2.ConvertToVector256Int32(ip);
                var vi1 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count);
                var vi2 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 2);
                var vi3 = Avx2.ConvertToVector256Int32(ip + Vector256<int>.Count * 3);
                ip += Vector256<byte>.Count;

                var vf0 = Avx.ConvertToVector256Single(vi0);
                var vf1 = Avx.ConvertToVector256Single(vi1);
                var vf2 = Avx.ConvertToVector256Single(vi2);
                var vf3 = Avx.ConvertToVector256Single(vi3);

                vf0 = Avx.Multiply(vf0, vscale);
                vf1 = Avx.Multiply(vf1, vscale);
                vf2 = Avx.Multiply(vf2, vscale);
                vf3 = Avx.Multiply(vf3, vscale);

                // Shuffle with HWIntrinsics.ShuffleMaskAlpha (presumably broadcasts each
                // pixel's alpha lane across the pixel - the constant is defined elsewhere).
                var vfa0 = Avx.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                var vfa1 = Avx.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                var vfa2 = Avx.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                var vfa3 = Avx.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                vf0 = Avx.Multiply(vf0, vfa0);
                vf1 = Avx.Multiply(vf1, vfa1);
                vf2 = Avx.Multiply(vf2, vfa2);
                vf3 = Avx.Multiply(vf3, vfa3);

                // Blend with HWIntrinsics.BlendMaskAlpha (presumably restores the
                // unmultiplied alpha into the alpha lane - matches the scalar path below).
                vf0 = Avx.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                vf1 = Avx.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                vf2 = Avx.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                vf3 = Avx.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                Avx.Store(op, vf0);
                Avx.Store(op + Vector256<int>.Count, vf1);
                Avx.Store(op + Vector256<int>.Count * 2, vf2);
                Avx.Store(op + Vector256<int>.Count * 3, vf3);
                op += Vector256<byte>.Count;
            }
            // Restore the true end for the scalar loop below.
            ipe += Vector256<byte>.Count;
        }
        else if (Sse41.IsSupported)
        {
            var vscale = Vector128.Create(1f / byte.MaxValue);

            // Pull the end back by one full step (16 bytes) so the loop never reads past it.
            ipe -= Vector128<byte>.Count;
            while (ip <= ipe)
            {
                // One pixel (4 bytes) per vector, four pixels per iteration.
                var vi0 = Sse41.ConvertToVector128Int32(ip);
                var vi1 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count);
                var vi2 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 2);
                var vi3 = Sse41.ConvertToVector128Int32(ip + Vector128<int>.Count * 3);
                ip += Vector128<byte>.Count;

                var vf0 = Sse2.ConvertToVector128Single(vi0);
                var vf1 = Sse2.ConvertToVector128Single(vi1);
                var vf2 = Sse2.ConvertToVector128Single(vi2);
                var vf3 = Sse2.ConvertToVector128Single(vi3);

                vf0 = Sse.Multiply(vf0, vscale);
                vf1 = Sse.Multiply(vf1, vscale);
                vf2 = Sse.Multiply(vf2, vscale);
                vf3 = Sse.Multiply(vf3, vscale);

                var vfa0 = Sse.Shuffle(vf0, vf0, HWIntrinsics.ShuffleMaskAlpha);
                var vfa1 = Sse.Shuffle(vf1, vf1, HWIntrinsics.ShuffleMaskAlpha);
                var vfa2 = Sse.Shuffle(vf2, vf2, HWIntrinsics.ShuffleMaskAlpha);
                var vfa3 = Sse.Shuffle(vf3, vf3, HWIntrinsics.ShuffleMaskAlpha);

                vf0 = Sse.Multiply(vf0, vfa0);
                vf1 = Sse.Multiply(vf1, vfa1);
                vf2 = Sse.Multiply(vf2, vfa2);
                vf3 = Sse.Multiply(vf3, vfa3);

                vf0 = Sse41.Blend(vf0, vfa0, HWIntrinsics.BlendMaskAlpha);
                vf1 = Sse41.Blend(vf1, vfa1, HWIntrinsics.BlendMaskAlpha);
                vf2 = Sse41.Blend(vf2, vfa2, HWIntrinsics.BlendMaskAlpha);
                vf3 = Sse41.Blend(vf3, vfa3, HWIntrinsics.BlendMaskAlpha);

                Sse.Store(op, vf0);
                Sse.Store(op + Vector128<int>.Count, vf1);
                Sse.Store(op + Vector128<int>.Count * 2, vf2);
                Sse.Store(op + Vector128<int>.Count * 3, vf3);
                op += Vector128<byte>.Count;
            }
            // Restore the true end for the scalar loop below.
            ipe += Vector128<byte>.Count;
        }
#endif

        // Scalar path: one pixel per iteration; channels 0..2 are multiplied by channel 3.
        while (ip < ipe)
        {
            float o0 = at[(uint)ip[0]];
            float o1 = at[(uint)ip[1]];
            float o2 = at[(uint)ip[2]];
            float o3 = at[(uint)ip[3]];
            ip += 4;

            op[0] = o0 * o3;
            op[1] = o1 * o3;
            op[2] = o2 * o3;
            op[3] = o3;
            op += 4;
        }
    }
}
// Zero-extends the four lowest bytes of value to four 32-bit lanes.
public static Vector128<int> _mm_cvtepu8_epi32(Vector128<byte> value)
    => Sse41.ConvertToVector128Int32(value);
// Reads four unsigned 16-bit values at address and zero-extends them to four 32-bit lanes.
public static unsafe Vector128<int> xmm(ushort* address)
    => Sse41.ConvertToVector128Int32(address);
// Zero-extends the four lowest 16-bit lanes of value to four 32-bit lanes.
public static Vector128<int> _mm_cvtepu16_epi32(Vector128<ushort> value)
    => Sse41.ConvertToVector128Int32(value);
// Blends nextPicture into currentPicture in place, byte by byte:
//   new = current * (1 - factor) + next * factor
// where factor is Attack when the incoming byte is greater than the stored byte and
// Decay otherwise. Whole groups of four bytes use SSE; the last (length % 4) bytes use
// the equivalent scalar code. The number of bytes processed is nextPicture.Data.Length.
// NOTE(review): currentPicture.Data is assumed to be at least that long - confirm.
public unsafe void Process(MutableByteImage currentPicture, MutableByteImage nextPicture)
{
    int length = nextPicture.Data.Length;
    int remainingLength = length % 4;
    // The vector loop must stop before the remainder; running it to `length` would
    // process the tail twice and let the scalar loop write past the end of the data.
    int vectorLength = length - remainingLength;

    Vector128<float> attack = Vector128.Create((float)Attack);
    Vector128<float> decay = Vector128.Create((float)Decay);
    Vector128<float> one = Vector128.Create(1f);

    fixed (byte* currentPicPtr = currentPicture.Data)
    fixed (byte* nextPicPtr = nextPicture.Data)
    {
        byte* currentPxPtr = currentPicPtr;
        byte* nextPxPtr = nextPicPtr;

        for (int i = 0; i < vectorLength; i += 4)
        {
            Vector128<int> nextInts = Sse41.ConvertToVector128Int32(nextPxPtr);
            Vector128<int> currentInts = Sse41.ConvertToVector128Int32(currentPxPtr);

            // Lane mask: all bits set where next > current. It must be reinterpreted,
            // not numerically converted, to stay usable as a bit mask.
            Vector128<float> brighterMask = Sse2.CompareGreaterThan(nextInts, currentInts).AsSingle();

            // Attack where the mask is set, Decay elsewhere.
            Vector128<float> pixelFactor = Sse41.BlendVariable(decay, attack, brighterMask);

            Vector128<float> result = Sse.Add(
                Sse.Multiply(Sse.Subtract(one, pixelFactor), Sse2.ConvertToVector128Single(currentInts)),
                Sse.Multiply(pixelFactor, Sse2.ConvertToVector128Single(nextInts)));

            // TODO improve Store
            currentPxPtr[0] = (byte)Sse41.Extract(result, 0);
            currentPxPtr[1] = (byte)Sse41.Extract(result, 1);
            currentPxPtr[2] = (byte)Sse41.Extract(result, 2);
            currentPxPtr[3] = (byte)Sse41.Extract(result, 3);

            currentPxPtr += 4;
            nextPxPtr += 4;
        }

        // Scalar tail: the last (length % 4) bytes.
        for (int i = 0; i < remainingLength; i++)
        {
            var currentColor = *nextPxPtr;
            var workingDataColor = *currentPxPtr;

            var newPixelFactor = workingDataColor < currentColor ? Attack : Decay;

            var newPixelValue = (byte)((currentColor * newPixelFactor) + (workingDataColor * (1 - newPixelFactor)));

            *currentPxPtr = newPixelValue;
            currentPxPtr++;
            nextPxPtr++;
        }
    }
}