/// <summary>
/// Multithreaded variant of the FMA sum-of-squares routine: returns the sum of
/// <c>vs[i] * vs[i]</c> over the whole array.
/// </summary>
/// <param name="vs">Bytes to square and sum. An empty array yields 0.</param>
/// <returns>The sum of the squares of all elements.</returns>
private unsafe long Test14_Intrinsics_FMA_MultiplyAdd_double_MT(byte[] vs)
{
    // Partitioner.Create rejects an empty range, so answer the trivial case directly.
    if (vs.Length == 0)
    {
        return 0;
    }

    long total = 0;
    int simdLength = Vector128<int>.Count;

    // Integer division yields 0 when there are fewer elements than processors,
    // and Partitioner.Create requires a positive range size.
    int rangeSize = Math.Max(1, vs.Length / Environment.ProcessorCount);

    Parallel.ForEach(Partitioner.Create(0, vs.Length, rangeSize), range =>
    {
        long subtotal = 0;

        // End of the part of this range that can be consumed in whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;
        Vector256<double> vTotal = Vector256.Create(0d);

        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Widen four bytes to int, then to double, and accumulate f * f + vTotal.
                Vector128<int> v = Avx2.ConvertToVector128Int32(p + i);
                Vector256<double> f = Avx.ConvertToVector256Double(v);
                vTotal = Fma.MultiplyAdd(f, f, vTotal);
            }
        }

        // Spill the four lane sums and fold them into the scalar subtotal.
        double* pp = stackalloc double[Vector256<double>.Count];
        Avx.Store(pp, vTotal);
        for (int i = 0; i < Vector256<double>.Count; i++)
        {
            subtotal += (long)pp[i];
        }

        // Scalar tail for the elements that did not fill a whole vector.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            subtotal += vs[i] * vs[i];
        }

        System.Threading.Interlocked.Add(ref total, subtotal);
    });

    return total;
}
/// <summary>
/// Converts the instance field <c>_fld</c> with <c>Avx.ConvertToVector256Double</c>,
/// writes the converted vector to the data table's output pointer and hands both to
/// <c>ValidateResult</c>.
/// </summary>
public void RunFldScenario()
{
    // The operand is read straight from the field; the converted value lives in a local.
    var widened = Avx.ConvertToVector256Double(_fld);

    Unsafe.Write(_dataTable.outArrayPtr, widened);

    ValidateResult(_fld, _dataTable.outArrayPtr);
}
/// <summary>
/// Sums the squares of all bytes in <paramref name="vs"/>, accumulating four elements at a
/// time in a <c>Vector256&lt;double&gt;</c> with fused multiply-add.
/// </summary>
/// <param name="vs">Bytes to square and sum.</param>
/// <returns>The sum of <c>vs[i] * vs[i]</c> over the array.</returns>
private unsafe long Test4_Intrinsics_FMA_MultiplyAdd_double(byte[] vs)
{
    int step = Vector128<int>.Count;
    int vectorEnd = vs.Length - (vs.Length % step);
    Vector256<double> accumulator = Vector256.Create(0d);

    fixed (byte* src = vs)
    {
        int offset = 0;
        while (offset < vectorEnd)
        {
            // Four bytes -> four ints -> four doubles, then accumulator += lanes * lanes.
            Vector128<int> widened = Sse41.ConvertToVector128Int32(src + offset);
            Vector256<double> lanes = Avx.ConvertToVector256Double(widened);
            accumulator = Fma.MultiplyAdd(lanes, lanes, accumulator);
            offset += step;
        }
    }

    // Spill the lane sums to the stack and add them up as integers.
    double* laneSums = stackalloc double[Vector256<double>.Count];
    Avx.Store(laneSums, accumulator);

    long sum = 0;
    for (int lane = 0; lane < Vector256<double>.Count; lane++)
    {
        sum += (long)laneSums[lane];
    }

    // Remaining elements that did not fill a whole vector.
    for (int k = vectorEnd; k < vs.Length; k++)
    {
        sum += vs[k] * vs[k];
    }

    return sum;
}
/// <summary>
/// For every index i, writes sqrt((x[i]-xx[i])^2 + (y[i]-yy[i])^2 + (z[i]-zz[i])^2) to
/// <paramref name="result"/>. The work is split into ranges processed in parallel, four
/// elements at a time with AVX, with a scalar loop for the leftover elements of each range.
/// </summary>
/// <param name="x">First coordinate of the first point set; its length defines the element count.</param>
/// <param name="y">Second coordinate of the first point set.</param>
/// <param name="z">Third coordinate of the first point set.</param>
/// <param name="xx">First coordinate of the second point set.</param>
/// <param name="yy">Second coordinate of the second point set.</param>
/// <param name="zz">Third coordinate of the second point set.</param>
/// <param name="result">Receives one value per element of <paramref name="x"/>.</param>
/// <exception cref="ArgumentException">An array is shorter than <paramref name="x"/>.</exception>
private unsafe void Test2_Vector256Double(byte[] x, byte[] y, byte[] z, byte[] xx, byte[] yy, byte[] zz, double[] result)
{
    int count = x.Length;

    // The loops below use raw pointers, so a short array would be read or written out of bounds.
    if (y.Length < count || z.Length < count || xx.Length < count || yy.Length < count
        || zz.Length < count || result.Length < count)
    {
        throw new ArgumentException("All input arrays and the result array must hold at least x.Length elements.");
    }

    // Partitioner.Create rejects an empty range.
    if (count == 0)
    {
        return;
    }

    Parallel.ForEach(Partitioner.Create(0, count), range =>
    {
        int simdLength = Vector256<double>.Count;

        // End of the part of this range that can be consumed in whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        fixed (byte* px = x, py = y, pz = z, pxx = xx, pyy = yy, pzz = zz)
        fixed (double* dp = result)
        {
            // Stop at lastIndex: going further would load and store past the end of the arrays.
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Per-axis differences, widened byte -> int -> double.
                Vector256<double> vx = Avx.Subtract(
                    Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(px + i)),
                    Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pxx + i)));
                Vector256<double> vy = Avx.Subtract(
                    Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(py + i)),
                    Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pyy + i)));
                Vector256<double> vz = Avx.Subtract(
                    Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pz + i)),
                    Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(pzz + i)));

                // Square root of the sum of squares.
                Vector256<double> vm = Avx.Add(Avx.Multiply(vx, vx), Avx.Multiply(vy, vy));
                vm = Avx.Sqrt(Avx.Add(vm, Avx.Multiply(vz, vz)));

                // Write the four results to the output array.
                Avx.Store(dp + i, vm);
            }
        }

        // Scalar tail: same arithmetic, in the same order, for the leftover elements.
        for (int i = lastIndex; i < range.Item2; i++)
        {
            double dx = x[i] - xx[i];
            double dy = y[i] - yy[i];
            double dz = z[i] - zz[i];
            result[i] = Math.Sqrt(dx * dx + dy * dy + dz * dz);
        }
    });
}
/// <summary>
/// Creates a fresh <c>SimpleUnaryOpTest__ConvertToVector256DoubleSingle</c> instance, converts
/// that instance's <c>_fld</c> with <c>Avx.ConvertToVector256Double</c>, writes the converted
/// vector to this object's output pointer and passes both to <c>ValidateResult</c>.
/// </summary>
public void RunLclFldScenario()
{
    // The operand comes from a field of a locally created object, not from this instance.
    var localInstance = new SimpleUnaryOpTest__ConvertToVector256DoubleSingle();
    var widened = Avx.ConvertToVector256Double(localInstance._fld);

    Unsafe.Write(_dataTable.outArrayPtr, widened);

    ValidateResult(localInstance._fld, _dataTable.outArrayPtr);
}
/// <summary>
/// Loads the operand from the data table's input pointer with <c>Sse.LoadAlignedVector128</c>
/// into a local, converts it with <c>Avx.ConvertToVector256Double</c>, writes the converted
/// vector to the output pointer and passes operand and output to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_LoadAligned()
{
    // The operand is held in a local variable before being converted.
    var operand = Sse.LoadAlignedVector128((float*)(_dataTable.inArrayPtr));
    var widened = Avx.ConvertToVector256Double(operand);

    Unsafe.Write(_dataTable.outArrayPtr, widened);

    ValidateResult(operand, _dataTable.outArrayPtr);
}
/// <summary>
/// Reads the operand from the data table's input pointer with <c>Unsafe.Read</c> into a local,
/// converts it with <c>Avx.ConvertToVector256Double</c>, writes the converted vector to the
/// output pointer and passes operand and output to <c>ValidateResult</c>.
/// </summary>
public void RunLclVarScenario_UnsafeRead()
{
    // The operand is held in a local variable before being converted.
    var operand = Unsafe.Read<Vector128<float>>(_dataTable.inArrayPtr);
    var widened = Avx.ConvertToVector256Double(operand);

    Unsafe.Write(_dataTable.outArrayPtr, widened);

    ValidateResult(operand, _dataTable.outArrayPtr);
}
/// <summary>
/// Converts a vector loaded with <c>Sse.LoadAlignedVector128</c> from the data table's input
/// pointer, with the load nested directly inside the <c>Avx.ConvertToVector256Double</c> call,
/// then writes the result to the output pointer and passes the input and output pointers to
/// <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_LoadAligned()
{
    // The load is deliberately left as a nested argument rather than a named local.
    var widened = Avx.ConvertToVector256Double(Sse.LoadAlignedVector128((float*)(_dataTable.inArrayPtr)));

    Unsafe.Write(_dataTable.outArrayPtr, widened);

    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Converts a vector read with <c>Unsafe.Read</c> from the data table's input pointer, with the
/// read nested directly inside the <c>Avx.ConvertToVector256Double</c> call, then writes the
/// result to the output pointer and passes the input and output pointers to
/// <c>ValidateResult</c>.
/// </summary>
public void RunBasicScenario_UnsafeRead()
{
    // The read is deliberately left as a nested argument rather than a named local.
    var widened = Avx.ConvertToVector256Double(Unsafe.Read<Vector128<float>>(_dataTable.inArrayPtr));

    Unsafe.Write(_dataTable.outArrayPtr, widened);

    ValidateResult(_dataTable.inArrayPtr, _dataTable.outArrayPtr);
}
/// <summary>
/// Walks <paramref name="vs"/> four bytes at a time, widens each group to a
/// <c>Vector256&lt;double&gt;</c> through the conversion intrinsics and takes its square root.
/// The results are discarded; elements past the last whole group of four are not processed.
/// </summary>
/// <remarks>Original author's note: about 4x faster; the vector is built with the converter.</remarks>
/// <param name="vs">Input bytes.</param>
private unsafe void Test6(byte[] vs)
{
    int simdLength = Vector256<double>.Count;
    int lastIndex = vs.Length - (vs.Length % simdLength);

    fixed (byte* p = vs)
    {
        for (int i = 0; i < lastIndex; i += simdLength)
        {
            // Offset by i so each iteration consumes the next four bytes instead of
            // converting the first four bytes of the array over and over.
            _ = Avx.Sqrt(Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(p + i)));
        }
    }
}
/// <summary>
/// Scratch method that exercises several load, convert and multiply intrinsics on the start of
/// <paramref name="vs"/>. Every result is kept only in a local; nothing is returned or stored.
/// </summary>
/// <remarks>
/// The second 256-bit load reads bytes 32..63, so the method touches 64 bytes of input.
/// NOTE(review): no length check is performed here; presumably callers pass at least 64 bytes.
/// References from the original notes:
/// x86/x64 SIMD instruction list (SSE to AVX2): https://www.officedaytime.com/tips/simd.html
/// pmaddwd: https://www.officedaytime.com/tips/simdimg/si.php?f=pmaddwd
/// </remarks>
/// <param name="vs">Input bytes.</param>
private unsafe void TestAddSum(byte[] vs)
{
    fixed (byte* src = vs)
    {
        // Two raw 32-byte loads.
        var firstBlock = Avx.LoadVector256(src);
        var secondBlock = Avx.LoadVector256(src + 32);

        // Candidate noted by the author: Avx.MultipleSumAbsoluteDifferences

        // byte -> int -> float, then an element-wise square.
        Vector256<int> ints256 = Avx2.ConvertToVector256Int32(src);
        Vector256<float> singles256 = Avx.ConvertToVector256Single(ints256);
        Vector256<float> squaredSingles = Avx.Multiply(singles256, singles256);

        // byte -> int -> double, squared through FMA with a zero addend.
        Vector128<int> ints128 = Sse41.ConvertToVector128Int32(src);
        Vector256<double> doubles256 = Avx.ConvertToVector256Double(ints128);
        var zeroDoubles = Vector256<double>.Zero;
        Vector256<double> fmaDoubles = Fma.MultiplyAdd(doubles256, doubles256, zeroDoubles);

        // Same again with float lanes.
        var ints256Again = Avx2.ConvertToVector256Int32(src);
        var singles256Again = Avx.ConvertToVector256Single(ints256Again);
        var zeroSingles = Vector256<float>.Zero;
        var fmaSingles = Fma.MultiplyAdd(singles256Again, singles256Again, zeroSingles);

        // 128-bit float conversion followed by a scalar multiply.
        Vector128<float> singles128 = Sse2.ConvertToVector128Single(ints128);
        Vector128<float> scalarProduct = Sse.MultiplyScalar(singles128, singles128);

        // byte -> short, then multiply adjacent pairs and add them (pmaddwd).
        Vector128<short> shorts128 = Sse41.ConvertToVector128Int16(src);
        Vector128<int> pairSums = Avx.MultiplyAddAdjacent(shorts128, shorts128);

        // Unused local; presumably a convenient place for a breakpoint.
        var breakpointAnchor = 0;

        // Other APIs the author listed as candidates to try:
        //   Avx.MultiplyAddAdjacent, Avx.MultiplyHigh, Avx.MultiplyHighRoundScale,
        //   Avx.MultiplyLow, Avx.MultiplyScalar,
        //   Fma.MultiplyAdd, Fma.MultiplyAddNegated, Fma.MultiplyAddNegatedScalar,
        //   Fma.MultiplyAddScalar, Fma.MultiplyAddSubtract, Fma.MultiplySubtract,
        //   Fma.MultiplySubtractAdd, Fma.MultiplySubtractNegated,
        //   Fma.MultiplySubtractNegatedScalar, Fma.MultiplySubtractScalar
    }
}
/// <summary>
/// Multithreaded counterpart of the four-bytes-at-a-time square-root loop: the array is split
/// into ranges processed in parallel, each group of four bytes is widened to a
/// <c>Vector256&lt;double&gt;</c> and its square root is taken. The results are discarded;
/// elements of a range past its last whole group of four are not processed.
/// </summary>
/// <remarks>Original author's note: about 12x faster; the vector Sqrt really is fast.</remarks>
/// <param name="vs">Input bytes.</param>
private unsafe void Test6_MT(byte[] vs)
{
    // Partitioner.Create rejects an empty range.
    if (vs.Length == 0)
    {
        return;
    }

    // Partition over the array that is actually passed in, so the pointer reads below
    // can never run past its end.
    Parallel.ForEach(Partitioner.Create(0, vs.Length), range =>
    {
        int simdLength = Vector256<double>.Count;

        // End of the part of this range that can be consumed in whole vectors.
        int lastIndex = range.Item2 - (range.Item2 - range.Item1) % simdLength;

        fixed (byte* p = vs)
        {
            for (int i = range.Item1; i < lastIndex; i += simdLength)
            {
                // Offset by i so each iteration consumes its own four bytes.
                _ = Avx.Sqrt(Avx.ConvertToVector256Double(Sse41.ConvertToVector128Int32(p + i)));
            }
        }
    });
}
/// <summary>
/// For each input in <paramref name="x"/>, linearly interpolates in the uniformly spaced table
/// <paramref name="A"/> (spanning minXA..maxXA) and in the uniformly spaced table
/// <paramref name="B"/> (spanning minXB..maxXB), then blends the two values as
/// (1 - weightB) * a + weightB * b. Four inputs are processed per iteration with AVX/AVX2.
/// </summary>
/// <remarks>
/// NOTE(review): the loop advances four elements at a time with no tail handling and stores to
/// an output of length <c>outputVectorSize</c>; this looks like it expects x.Length to be a
/// multiple of four and no larger than <c>outputVectorSize</c> - confirm against callers.
/// </remarks>
/// <param name="x">Input positions.</param>
/// <param name="A">Samples of the first table.</param>
/// <param name="minXA">Position of the first sample of <paramref name="A"/>.</param>
/// <param name="maxXA">Position of the last sample of <paramref name="A"/>.</param>
/// <param name="B">Samples of the second table.</param>
/// <param name="minXB">Position of the first sample of <paramref name="B"/>.</param>
/// <param name="maxXB">Position of the last sample of <paramref name="B"/>.</param>
/// <param name="weightB">Blend weight of the second table; the first gets 1 - weightB.</param>
/// <returns>A new array of length <c>outputVectorSize</c> holding the blended values.</returns>
private static unsafe double[] BilinearInterpol_AVX(
    double[] x,
    double[] A, double minXA, double maxXA,
    double[] B, double minXB, double maxXB,
    double weightB)
{
    double[] z = new double[outputVectorSize];

    fixed (double* xPtr = &x[0], tableA = &A[0], tableB = &B[0], outPtr = &z[0])
    {
        // Sample spacing of each table and its reciprocal.
        double spacingA = (maxXA - minXA) / (double)(A.Length - 1);
        double spacingB = (maxXB - minXB) / (double)(B.Length - 1);
        double reciprocalA = 1.0 / spacingA;
        double reciprocalB = 1.0 / spacingB;

        // Broadcast every scalar once, outside the loop.
        Vector256<double> blendA = Vector256.Create(1 - weightB);
        Vector256<double> blendB = Vector256.Create(weightB);
        Vector256<double> lowA = Vector256.Create(minXA);
        Vector256<double> highA = Vector256.Create(maxXA);
        Vector256<double> lowB = Vector256.Create(minXB);
        Vector256<double> highB = Vector256.Create(maxXB);
        Vector256<double> stepA = Vector256.Create(spacingA);
        Vector256<double> stepB = Vector256.Create(spacingB);
        Vector256<double> invStepA = Vector256.Create(reciprocalA);
        Vector256<double> invStepB = Vector256.Create(reciprocalB);

        Vector128<int> lastIndexA = Vector128.Create(A.Length - 1);
        Vector128<int> lastIndexB = Vector128.Create(B.Length - 1);
        Vector128<int> ones = Vector128.Create(1);
        Vector128<int> zeros = Vector128<int>.Zero;

        for (int offset = 0; offset < x.Length; offset += Vector256<double>.Count)
        {
            Vector256<double> sample = Avx.LoadVector256(xPtr + offset);

            // ---- Table A ----
            // Index of the sample at or below each input: truncate (x - min) / spacing to int
            // and clamp it, and its successor, to the valid index range.
            Vector256<double> positionA = Avx.Multiply(Avx.Subtract(sample, lowA), invStepA);
            Vector128<int> indexA = Avx.ConvertToVector128Int32WithTruncation(positionA);
            indexA = Sse41.Min(Sse41.Max(indexA, zeros), lastIndexA);
            Vector128<int> nextA = Sse41.Min(Sse2.Add(indexA, ones), lastIndexA);

            // Position of that sample, and the fractional distance of the clamped input from it.
            Vector256<double> nodeA = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(indexA), stepA), lowA);
            Vector256<double> clampedA = Avx.Max(lowA, Avx.Min(sample, highA));
            Vector256<double> fractionA = Avx.Multiply(Avx.Subtract(clampedA, nodeA), invStepA);

            // Gather the two neighbouring samples (scale 8 = sizeof(double)).
            Vector256<double> leftA = Avx2.GatherVector256(tableA, indexA, 8);
            Vector256<double> rightA = Avx2.GatherVector256(tableA, nextA, 8);

            // ---- Table B: same steps ----
            Vector256<double> positionB = Avx.Multiply(Avx.Subtract(sample, lowB), invStepB);
            Vector128<int> indexB = Avx.ConvertToVector128Int32WithTruncation(positionB);
            indexB = Sse41.Min(Sse41.Max(indexB, zeros), lastIndexB);
            Vector128<int> nextB = Sse41.Min(Sse2.Add(indexB, ones), lastIndexB);

            Vector256<double> nodeB = Avx.Add(Avx.Multiply(Avx.ConvertToVector256Double(indexB), stepB), lowB);
            Vector256<double> clampedB = Avx.Max(lowB, Avx.Min(sample, highB));
            Vector256<double> fractionB = Avx.Multiply(Avx.Subtract(clampedB, nodeB), invStepB);

            Vector256<double> leftB = Avx2.GatherVector256(tableB, indexB, 8);
            Vector256<double> rightB = Avx2.GatherVector256(tableB, nextB, 8);

            // left + fraction * (right - left) per table, then the weighted blend of both.
            Vector256<double> valueA = Avx.Add(leftA, Avx.Multiply(fractionA, Avx.Subtract(rightA, leftA)));
            Vector256<double> valueB = Avx.Add(leftB, Avx.Multiply(fractionB, Avx.Subtract(rightB, leftB)));
            Vector256<double> blended = Avx.Add(Avx.Multiply(blendA, valueA), Avx.Multiply(blendB, valueB));

            Avx.Store(outPtr + offset, blended);
        }
    }

    return z;
}
/// <summary>
/// Extracts one channel of <paramref name="bitmap"/> into a dense vector with one value per
/// pixel, row by row. 8, 24 and 32 bpp data is divided by 256 and 16 bpp data by 65536.
/// </summary>
/// <remarks>
/// Bitmaps whose pixel format is not 24 bpp RGB or one of the 32 bpp formats are first converted
/// with <c>MakeGrayscale</c> (gray channel) or <c>MakeColor</c> (any other channel); a 24/32 bpp
/// bitmap is also passed through <c>MakeGrayscale</c> when the gray channel is requested.
/// Converted bitmaps are disposed before returning; the caller's bitmap is not.
/// When the requested channel is alpha and the data has no alpha bytes (8, 16 or 24 bpp), and
/// for any other bit depth, the result is left as zeros.
/// </remarks>
/// <param name="bitmap">Source image.</param>
/// <param name="channel">Channel to extract; defaults to gray.</param>
/// <returns>A dense vector of width * height values.</returns>
public unsafe static Vector<double> BitmapToVector(Bitmap bitmap, BitmapChannel channel = BitmapChannel.Gray)
{
    int width = bitmap.Width;
    int height = bitmap.Height;
    int pixelCount = width * height;
    bool needDispose = false;
    bool isGray = false;
    var rect = new Rectangle(0, 0, width, height);

    switch (bitmap.PixelFormat)
    {
        case PixelFormat.Format24bppRgb:
        case PixelFormat.Format32bppArgb:
        case PixelFormat.Format32bppPArgb:
        case PixelFormat.Format32bppRgb:
            break;
        default:
            bitmap = channel == BitmapChannel.Gray ? MakeGrayscale(bitmap) : MakeColor(bitmap);
            needDispose = true;
            break;
    }

    if (!needDispose && channel == BitmapChannel.Gray)
    {
        bitmap = MakeGrayscale(bitmap);
        needDispose = true;
        isGray = true;
    }

    // Take the bit depth from the bitmap that is actually locked below: after a conversion its
    // pixel format, and therefore its memory layout, may differ from the caller's bitmap.
    int depth = Bitmap.GetPixelFormatSize(bitmap.PixelFormat);

    var result = new double[pixelCount];
    BitmapData bitmapData = null;
    try
    {
        // Locking inside the try ensures a converted bitmap is still disposed if LockBits throws.
        bitmapData = bitmap.LockBits(rect, ImageLockMode.ReadOnly, bitmap.PixelFormat);

        byte* scan0 = (byte*)bitmapData.Scan0;
        int stride = bitmapData.Stride;
        int rowWidth = bitmapData.Width;
        int rowCount = bitmapData.Height;

        // Index in result where row y starts; with a negative stride the rows are stored in
        // reverse order in the output.
        int RowStart(int y)
        {
            int start = y * rowWidth;
            return stride < 0 ? (pixelCount - rowWidth) - start : start;
        }

        switch (depth)
        {
            case 8: // 8 bpp: one byte per pixel
                if (channel == BitmapChannel.Alpha)
                {
                    break;
                }

                for (int y = 0; y < rowCount; y++)
                {
                    byte* row = scan0 + (y * stride);
                    int start = RowStart(y);
                    for (int x = 0; x < rowWidth; x++)
                    {
                        result[start + x] = row[x];
                    }
                }

                PointwiseDivideInPlace(result, 256.0);
                break;

            case 16: // 16 bpp: gray with 65536 shades
                if (channel == BitmapChannel.Alpha)
                {
                    break;
                }

                for (int y = 0; y < rowCount; y++)
                {
                    // Stride is in bytes, so step in bytes before viewing the row as 16-bit
                    // values; read them unsigned so the full 0..65535 range stays positive.
                    ushort* row = (ushort*)(scan0 + (y * stride));
                    int start = RowStart(y);
                    for (int x = 0; x < rowWidth; x++)
                    {
                        result[start + x] = row[x];
                    }
                }

                PointwiseDivideInPlace(result, 65536.0);
                break;

            case 24: // 24 bpp: blue, green, red
            case 32: // 32 bpp: blue, green, red, alpha
                if (channel == BitmapChannel.Alpha && depth == 24)
                {
                    // 24-bit images have no alpha channel.
                    break;
                }

                int step = depth / 8;

                if (channel == BitmapChannel.Gray)
                {
                    if (isGray && UseGrayConverter)
                    {
                        // In a gray image produced by the converter R = G = B, so the first
                        // byte of each pixel is enough.
                        for (int y = 0; y < rowCount; y++)
                        {
                            byte* row = scan0 + (y * stride);
                            int start = RowStart(y);
                            for (int i = 0, x = 0; i < rowWidth; i++, x += step)
                            {
                                result[start + i] = row[x];
                            }
                        }
                    }
                    else if (UseAvx)
                    {
                        var vectorGrayCoeffAvx = Vector256.Create(0.11d, 0.59d, 0.3d, 0d);
                        for (int y = 0; y < rowCount; y++)
                        {
                            byte* row = scan0 + (y * stride);
                            int start = RowStart(y);
                            for (int i = 0, x = 0; i < rowWidth; i++, x += step)
                            {
                                var vectorB = Vector128.Create((int)row[x], (int)row[x + 1], (int)row[x + 2], (int)0);
                                var vectorD = Avx.ConvertToVector256Double(vectorB);
                                var vectorGray = Avx.Multiply(vectorD, vectorGrayCoeffAvx);
                                result[start + i] = vectorGray.GetElement(0) + vectorGray.GetElement(1) + vectorGray.GetElement(2);
                            }
                        }
                    }
                    else if (UseSIMD)
                    {
                        var vectorGrayCoeff = new Numerics.Vector4(0.11f, 0.59f, 0.3f, 0f);
                        for (int y = 0; y < rowCount; y++)
                        {
                            byte* row = scan0 + (y * stride);
                            int start = RowStart(y);
                            for (int i = 0, x = 0; i < rowWidth; i++, x += step)
                            {
                                var vectorF = new Numerics.Vector4(row[x], row[x + 1], row[x + 2], 0);
                                result[start + i] = Numerics.Vector4.Dot(vectorF, vectorGrayCoeff);
                            }
                        }
                    }
                    else
                    {
                        for (int y = 0; y < rowCount; y++)
                        {
                            byte* row = scan0 + (y * stride);
                            int start = RowStart(y);
                            for (int i = 0, x = 0; i < rowWidth; i++, x += step)
                            {
                                // Same weights as the AVX and Vector4 paths: 0.11 blue,
                                // 0.59 green, 0.3 red.
                                result[start + i] = 0.11d * row[x] + 0.59d * row[x + 1] + 0.3d * row[x + 2];
                            }
                        }
                    }
                }
                else
                {
                    // Byte offset of the requested channel inside a pixel; -1 means nothing to copy.
                    int channelOffset;
                    switch (channel)
                    {
                        case BitmapChannel.Red:
                            channelOffset = 2;
                            break;
                        case BitmapChannel.Green:
                            channelOffset = 1;
                            break;
                        case BitmapChannel.Blue:
                            channelOffset = 0;
                            break;
                        case BitmapChannel.Alpha:
                            channelOffset = depth == 32 ? 3 : -1;
                            break;
                        default:
                            channelOffset = -1;
                            break;
                    }

                    if (channelOffset >= 0)
                    {
                        for (int y = 0; y < rowCount; y++)
                        {
                            byte* row = scan0 + (y * stride);
                            int start = RowStart(y);
                            for (int i = 0, x = 0; i < rowWidth; i++, x += step)
                            {
                                result[start + i] = row[x + channelOffset];
                            }
                        }
                    }
                }

                PointwiseDivideInPlace(result, 256.0);
                break;
        }
    }
    finally
    {
        if (bitmapData != null)
        {
            bitmap.UnlockBits(bitmapData);
        }

        if (needDispose)
        {
            bitmap.Dispose();
        }
    }

    return Vector<double>.Build.Dense(result);
}