/// <summary>
/// Alea GPU kernel that fills one blockDim.x-by-blockDim.x tile of the squared-distance
/// matrix per thread block; every thread produces two adjacent columns per row it visits.
/// </summary>
/// <remarks>
/// Same as CudaKernelOptimised2, but the number of coordinates is given as a
/// meta-constant. Also, the results are written as Real2 pairs.
/// The extern shared array is indexed up to 2 * c * blockDim.x elements (row tile followed
/// by column tile), so the launch must provide at least that much shared memory.
/// A column pair is skipped only when its first column is at or beyond n; for odd n the
/// last pair still stores a second value into column n.
/// NOTE(review): assumes blockDim.x is even (threads are split into two halves) and that
/// (row * pitch + block column start) is suitably aligned for a Real2 store - confirm
/// against the launch code.
/// </remarks>
/// <param name="mSquaredDistances">Output buffer; element (row, col) is stored at <c>row * pitch + col</c>.</param>
/// <param name="mCoordinates">Input buffer; coordinate k of point p is read from <c>k * n + p</c>.</param>
/// <param name="c">Number of coordinates per point, passed as a constant.</param>
/// <param name="n">Number of points; rows at or beyond n are not processed.</param>
/// <param name="pitch">Row stride of <paramref name="mSquaredDistances"/>, in Real elements.</param>
private static void AleaKernelConstants(
    deviceptr<Real> mSquaredDistances,
    deviceptr<Real> mCoordinates,
    Constant<int> c,
    int n,
    int pitch)
{
    // A single extern shared array backs both tiles: rows first, columns right after.
    var tileBase = DeviceFunction.AddressOfArray(__shared__.ExternArray<Real>());
    var rowTile = tileBase.Ptr(0);
    var colTile = tileBase.Ptr(c.Value * blockDim.x);

    var rowStart = blockIdx.y * blockDim.x;
    var colStart = blockIdx.x * blockDim.x;

    // Each thread stages one point of each tile; points at or beyond n are left unloaded.
    var loadsRowPoint = rowStart + threadIdx.x < n;
    var loadsColPoint = colStart + threadIdx.x < n;
    for (int k = 0; k != c.Value; ++k)
    {
        var slot = k * blockDim.x + threadIdx.x;
        if (loadsRowPoint)
        {
            rowTile[slot] = mCoordinates[k * n + rowStart + threadIdx.x];
        }

        if (loadsColPoint)
        {
            colTile[slot] = mCoordinates[k * n + colStart + threadIdx.x];
        }
    }

    // Both tiles must be complete before any thread reads them.
    DeviceFunction.SyncThreads();

    // The first half of the threads starts at row 0, the second half at row 1; within a
    // half, 'pair' selects which two adjacent columns the thread owns.
    var halfWidth = blockDim.x / 2;
    var firstRow = threadIdx.x / halfWidth;
    var pair = threadIdx.x % halfWidth;

    var hasPair = colStart + pair * 2 < n;
    if (hasPair)
    {
        // View the column tile as Real2 so both columns are fetched with one read.
        var colPairs = colTile.Reinterpret<Real2>();

        for (int i = firstRow; i < blockDim.x && rowStart + i < n; i += 2)
        {
            Real sumFirst = 0;
            Real sumSecond = 0;

            for (int k = 0; k != c.Value; ++k)
            {
                var own = rowTile[k * blockDim.x + i];
                var other = colPairs[(k * blockDim.x / 2) + pair];

                var deltaFirst = own - other.x;
                var deltaSecond = own - other.y;
                sumFirst += deltaFirst * deltaFirst;
                sumSecond += deltaSecond * deltaSecond;
            }

            // Store both results with a single Real2 write into this row of the output.
            var rowPairs = mSquaredDistances
                .Ptr((rowStart + i) * pitch + colStart)
                .Reinterpret<Real2>();
            rowPairs[pair] = new Real2(sumFirst, sumSecond);
        }
    }
}
/// <summary>
/// Alea GPU kernel that fills one blockDim.x-by-blockDim.x tile of the squared-distance
/// matrix per thread block; every thread produces two adjacent columns per row it visits.
/// </summary>
/// <remarks>
/// Same as KernelSharedMemory, but one thread handles two elements at once by using
/// Real2 (float2-style) reads from shared memory.
/// The extern shared array is indexed up to 2 * c * blockDim.x elements (row tile followed
/// by column tile), so the launch must provide at least that much shared memory.
/// Results are stored as two scalar writes; the second one is skipped when its column is
/// at or beyond n, so nothing is written outside the first n columns of a row.
/// NOTE(review): assumes blockDim.x is even (threads are split into two halves and the
/// column tile is read as Real2 pairs) - confirm against the launch code.
/// </remarks>
/// <param name="mSquaredDistances">Output buffer; element (row, col) is stored at <c>row * pitch + col</c>.</param>
/// <param name="mCoordinates">Input buffer; coordinate k of point p is read from <c>k * n + p</c>.</param>
/// <param name="c">Number of coordinates per point.</param>
/// <param name="n">Number of points; rows and columns at or beyond n are not written.</param>
/// <param name="pitch">Row stride of <paramref name="mSquaredDistances"/>, in Real elements.</param>
private static void AleaKernelFloat2(
    deviceptr<Real> mSquaredDistances,
    deviceptr<Real> mCoordinates,
    int c,
    int n,
    int pitch)
{
    // A single extern shared array backs both tiles: rows first, columns right after.
    var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray<Real>());
    var coordinatesI = shared.Ptr(0);
    var coordinatesJ = shared.Ptr(c * blockDim.x);

    var bI = blockIdx.y * blockDim.x;
    var bJ = blockIdx.x * blockDim.x;

    // Each thread stages one point of each tile; points at or beyond n are left unloaded.
    for (int k = 0; k != c; ++k)
    {
        if (bI + threadIdx.x < n)
        {
            coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
        }

        if (bJ + threadIdx.x < n)
        {
            coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
        }
    }

    // Both tiles must be complete before any thread reads them.
    DeviceFunction.SyncThreads();

    // The first half of the threads starts at row 0, the second half at row 1; within a
    // half, 'tid' selects which two adjacent columns the thread owns.
    var line = threadIdx.x / (blockDim.x / 2);
    var tid = threadIdx.x % (blockDim.x / 2);

    if (bJ + tid * 2 < n)
    {
        // View the column tile as Real2 so both columns are fetched with one read.
        var coordinatesJ2 = coordinatesJ.Reinterpret<Real2>();
        var column = bJ + 2 * tid;

        // For odd n the last pair's second column equals n: its tile entries were never
        // loaded above, so its distance is meaningless and must not be stored.
        var hasSecondColumn = column + 1 < n;

        for (int i = line; i < blockDim.x && bI + i < n; i += 2)
        {
            Real dist0 = 0;
            Real dist1 = 0;

            for (int k = 0; k != c; ++k)
            {
                var coord1 = coordinatesI[k * blockDim.x + i];
                var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y);
                dist0 += diff.x * diff.x;
                dist1 += diff.y * diff.y;
            }

            var rowOffset = (bI + i) * pitch;
            mSquaredDistances[rowOffset + column] = dist0;
            if (hasSecondColumn)
            {
                mSquaredDistances[rowOffset + column + 1] = dist1;
            }
        }
    }
}
/// <summary>
/// Alea GPU kernel that fills one dimX-by-dimX tile of the squared-distance matrix per
/// thread block; every thread produces two adjacent columns per row it visits.
/// </summary>
/// <remarks>
/// Same as KernelConstants, but uses both local and shared memory to increase the
/// effective shared memory: only the row tile lives in a <c>__shared__</c> array, while
/// each thread keeps its own column pair in a <c>__local__</c> array of Real2.
/// The Real2 indices <c>(k * n + bJ) / 2</c> and <c>(row * pitch + bJ) / 2</c> are integer
/// divisions, so they address the intended pair only when the dividend is even.
/// A column pair is skipped only when its first column is at or beyond n.
/// NOTE(review): dimX is used wherever the sibling kernels use blockDim.x - presumably the
/// launch sets blockDim.x to dimX, with dimX even; n and pitch presumably need to be even
/// as well for the Real2 accesses - confirm against the launch code.
/// </remarks>
/// <param name="mSquaredDistances">Output buffer; written as Real2 pairs, element (row, col) at <c>row * pitch + col</c>.</param>
/// <param name="mCoordinates">Input buffer; coordinate k of point p is read from <c>k * n + p</c>.</param>
/// <param name="dimX">Tile width, passed as a constant so it can size the shared array.</param>
/// <param name="c">Number of coordinates per point, passed as a constant so it can size both arrays.</param>
/// <param name="n">Number of points; rows at or beyond n are not processed.</param>
/// <param name="pitch">Row stride of <paramref name="mSquaredDistances"/>, in Real elements.</param>
private static void AleaKernelLocalMemory(
    deviceptr<Real> mSquaredDistances,
    deviceptr<Real> mCoordinates,
    Constant<int> dimX,
    Constant<int> c,
    int n,
    int pitch)
{
    // Row tile is shared by the block; the column pair is kept privately by each thread.
    var rowTile = __shared__.Array<Real>(c.Value * dimX.Value);
    var ownPair = __local__.Array<Real2>(c.Value);

    var rowStart = blockIdx.y * dimX.Value;
    var colStart = blockIdx.x * dimX.Value;

    // The first half of the threads starts at row 0, the second half at row 1; within a
    // half, 'pair' selects which two adjacent columns the thread owns.
    var halfWidth = dimX.Value / 2;
    var firstRow = threadIdx.x / halfWidth;
    var pair = threadIdx.x % halfWidth;
    var hasPair = colStart + pair * 2 < n;

    // Real2 views of the global buffers, so two adjacent elements move in one access.
    var coordinatePairs = mCoordinates.Reinterpret<Real2>();
    var distancePairs = mSquaredDistances.Reinterpret<Real2>();

    var loadsRowPoint = rowStart + threadIdx.x < n;
    for (int k = 0; k != c.Value; ++k)
    {
        if (loadsRowPoint)
        {
            rowTile[k * dimX.Value + threadIdx.x] = mCoordinates[k * n + rowStart + threadIdx.x];
        }

        if (hasPair)
        {
            ownPair[k] = coordinatePairs[(k * n + colStart) / 2 + pair];
        }
    }

    // The shared row tile must be complete before any thread reads it.
    DeviceFunction.SyncThreads();

    if (hasPair)
    {
        for (int i = firstRow; i < dimX.Value && rowStart + i < n; i += 2)
        {
            Real sumFirst = 0;
            Real sumSecond = 0;

            for (int k = 0; k != c.Value; ++k)
            {
                var own = rowTile[k * dimX.Value + i];
                var other = ownPair[k];

                var deltaFirst = own - other.x;
                var deltaSecond = own - other.y;
                sumFirst += deltaFirst * deltaFirst;
                sumSecond += deltaSecond * deltaSecond;
            }

            // Store both results with a single Real2 write into this row of the output.
            distancePairs[((rowStart + i) * pitch + colStart) / 2 + pair] = new Real2(sumFirst, sumSecond);
        }
    }
}