/// <summary>
/// Runs <paramref name="kernel"/> on <paramref name="gpu"/> over a square grid of
/// one-dimensional thread blocks, reports the elapsed time, and copies the device
/// result buffer back into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">Device used for allocation, launch and synchronisation.</param>
/// <param name="mSquaredDistances">Host array that receives the result via <c>Gpu.Copy2D</c>.</param>
/// <param name="mCoordinates">Host coordinates handed to <c>gpu.AllocateDevice</c>.</param>
/// <param name="c">Forwarded to the kernel wrapped in <c>Gpu.Constant</c>; presumably the number of coordinates per point.</param>
/// <param name="n">Edge length of the result buffer; also forwarded to the kernel.</param>
/// <param name="name">Label passed to <c>Util.PrintPerformance</c>.</param>
/// <param name="kernel">
/// Kernel receiving, in order: result pointer, coordinates pointer, block size (constant),
/// <paramref name="c"/> (constant), <paramref name="n"/>, and the result pitch in elements.
/// </param>
private static void AleaOptimisedImpl(
    Gpu gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<deviceptr<Real>, deviceptr<Real>, Constant<int>, Constant<int>, int, int> kernel)
{
    const int threadsPerBlock = 256;

    using (var deviceDistances = gpu.AllocateDevice<Real>(n, n))
    using (var deviceCoordinates = gpu.AllocateDevice(mCoordinates))
    {
        // The stopwatch starts after the device allocations, so they are not part of the reported time.
        var stopwatch = Stopwatch.StartNew();

        // Enough blocks along each grid axis to cover n with threadsPerBlock threads per block.
        var blocksPerAxis = Util.DivUp(n, threadsPerBlock);
        var gridDim = new dim3(blocksPerAxis, blocksPerAxis, 1);
        var blockDim = new dim3(threadsPerBlock, 1, 1);
        var launchParam = new LaunchParam(gridDim, blockDim);

        var rowPitch = deviceDistances.PitchInElements.ToInt32();

        gpu.Launch(
            kernel,
            launchParam,
            deviceDistances.Ptr,
            deviceCoordinates.Ptr,
            Gpu.Constant(threadsPerBlock),
            Gpu.Constant(c),
            n,
            rowPitch);

        // Wait for the kernel before reading the stopwatch; the copy back to the host is not timed.
        gpu.Synchronize();
        Util.PrintPerformance(stopwatch, name, n, c, n);

        Gpu.Copy2D(deviceDistances, mSquaredDistances, n, n);
    }
}
/// <summary>
/// Runs <paramref name="kernel"/> on <c>Gpu.Default</c> over a square grid of
/// one-dimensional thread blocks, reports the elapsed time, and copies the device
/// result buffer back into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <typeparam name="TInt">
/// Type in which the coordinate count is handed to the kernel, as produced by
/// <paramref name="numCoordGetter"/>.
/// </typeparam>
/// <param name="mSquaredDistances">Host array that receives the result via <c>Gpu.Copy2D</c>.</param>
/// <param name="mCoordinates">Host coordinates handed to <c>gpu.AllocateDevice</c>.</param>
/// <param name="c">Converted by <paramref name="numCoordGetter"/> and used to size the third launch parameter; presumably the number of coordinates per point.</param>
/// <param name="n">Edge length of the result buffer; also forwarded to the kernel.</param>
/// <param name="name">Label passed to <c>Util.PrintPerformance</c>.</param>
/// <param name="kernel">
/// Kernel receiving, in order: result pointer, coordinates pointer, the converted
/// <paramref name="c"/>, <paramref name="n"/>, and the result pitch in elements.
/// The pointers are typed with <c>Real</c> so they always match the buffers allocated here.
/// </param>
/// <param name="numCoordGetter">Converts <paramref name="c"/> into the <typeparamref name="TInt"/> the kernel expects.</param>
private static void CudaOptimisedImpl<TInt>(
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<deviceptr<Real>, deviceptr<Real>, TInt, int, int> kernel,
    Func<int, TInt> numCoordGetter)
{
    var gpu = Gpu.Default;

    using var cudaSquaredDistance = gpu.AllocateDevice<Real>(n, n);
    using var cudaCoordinates = gpu.AllocateDevice(mCoordinates);

    // The stopwatch starts after the device allocations, so they are not part of the reported time.
    var timer = Stopwatch.StartNew();

    const int blockSize = 128;
    var gridSize = Util.DivUp(n, blockSize);

    // Third LaunchParam argument: 2 * c * blockSize elements of Real, in bytes.
    // NOTE(review): presumably the kernel's dynamic shared-memory size - confirm against the kernel.
    var sharedBytes = 2 * c * blockSize * sizeof(Real);
    var lp = new LaunchParam(
        new dim3(gridSize, gridSize, 1),
        new dim3(blockSize, 1, 1),
        sharedBytes);

    var pitch = cudaSquaredDistance.PitchInElements.ToInt32();

    gpu.Launch(
        kernel,
        lp,
        cudaSquaredDistance.Ptr,
        cudaCoordinates.Ptr,
        numCoordGetter(c),
        n,
        pitch);

    // Wait for the kernel before reading the stopwatch; the copy back to the host is not timed.
    gpu.Synchronize();
    Util.PrintPerformance(timer, name, n, c, n);

    Gpu.Copy2D(cudaSquaredDistance, mSquaredDistances, n, n);
}