/// <summary>
/// Sums the first <paramref name="N"/> elements of <paramref name="a"/> into
/// <paramref name="total"/>[0] using a per-block shared-memory tree reduction
/// followed by one atomic fold per block.
/// </summary>
/// <param name="a">Input values (device-resident array).</param>
/// <param name="N">Number of elements to reduce.</param>
/// <param name="total">Single-element accumulator; total[0] receives the sum.</param>
public static void Total(FloatResidentArray a, int N, float[] total)
{
    // Block-shared scratch buffer, one slot per thread in the block.
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int lane = threadIdx.x;

    // Grid-stride accumulation: each thread sums its strided share of the input.
    float sum = 0f;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < N; idx += blockDim.x * gridDim.x)
    {
        sum += a[idx];
    }
    cache[lane] = sum;
    CUDAIntrinsics.__syncthreads();

    // Pairwise tree reduction within the block. NOTE(review): the halving loop
    // sums every slot only when blockDim.x is a power of two — confirm launch config.
    for (int half = blockDim.x / 2; half != 0; half >>= 1)
    {
        if (lane < half)
        {
            cache[lane] += cache[lane + half];
        }
        // All threads must pass the barrier, including those not reducing this round.
        CUDAIntrinsics.__syncthreads();
    }

    // Thread 0 of each block folds the block's partial sum into the global total.
    if (lane == 0)
    {
        AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
    }
}
/// <summary>
/// Computes the dot product of <paramref name="a"/> and <paramref name="b"/>
/// over the first <paramref name="N"/> elements, accumulating into
/// <paramref name="result"/>[0] via a per-block shared-memory reduction.
/// </summary>
/// <param name="N">Number of element pairs to multiply-accumulate.</param>
/// <param name="a">First input vector (device-resident array).</param>
/// <param name="b">Second input vector (device-resident array).</param>
/// <param name="result">Single-element accumulator; result[0] receives the dot product.</param>
public static void ScalarProd(int N, FloatResidentArray a, FloatResidentArray b, float[] result)
{
    // Block-shared scratch buffer, one slot per thread in the block.
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride multiply-accumulate over this thread's share of the input.
    float tmp = 0.0F;
    while (tid < N)
    {
        tmp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }
    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    // Pairwise tree reduction within the block. NOTE(review): assumes
    // blockDim.x is a power of two — confirm launch config.
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    // Fold the block's partial result into the global accumulator.
    // Fix: use AtomicExpr.apply like every other float reduction in this file
    // (Total, private ScalarProd, InnerReduce) instead of the ad-hoc AtomicAdd helper.
    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
    }
}
/// <summary>
/// Dot product of <paramref name="r1"/> and <paramref name="r2"/> over the
/// first <paramref name="N"/> elements, atomically accumulated into
/// <paramref name="result"/>[0].
/// </summary>
/// <param name="result">Single-element accumulator; result[0] receives the dot product.</param>
/// <param name="r1">First input vector (device-resident array).</param>
/// <param name="r2">Second input vector (device-resident array).</param>
/// <param name="N">Number of element pairs to multiply-accumulate.</param>
private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
{
    // Block-shared scratch buffer, one slot per thread in the block.
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int lane = threadIdx.x;

    // Grid-stride multiply-accumulate over this thread's share of the input.
    float partial = 0.0F;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < N; idx += blockDim.x * gridDim.x)
    {
        partial += r1[idx] * r2[idx];
    }
    cache[lane] = partial;
    CUDAIntrinsics.__syncthreads();

    // Pairwise tree reduction within the block. NOTE(review): assumes
    // blockDim.x is a power of two — confirm launch config.
    for (int half = blockDim.x / 2; half != 0; half >>= 1)
    {
        if (lane < half)
        {
            cache[lane] += cache[lane + half];
        }
        CUDAIntrinsics.__syncthreads();
    }

    // Thread 0 folds the block's partial result into the global accumulator.
    if (lane == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
    }
}
/// <summary>
/// Accumulates the gravitational acceleration on <paramref name="bodyPos"/>
/// from all bodies in <paramref name="positions"/>, processed one
/// block-sized tile at a time through shared memory.
/// </summary>
/// <param name="softeningSquared">Softening factor passed through to BodyBodyInteraction.</param>
/// <param name="bodyPos">Position of the body being accelerated.</param>
/// <param name="positions">All body positions; read in tiles of blockDim.x.</param>
/// <param name="numTiles">Number of blockDim.x-sized tiles to process.</param>
/// <returns>The accumulated acceleration vector.</returns>
public static float3 ComputeBodyAccel(float softeningSquared, float4 bodyPos, float4[] positions, int numTiles)
{
    // One staged position per thread in the block.
    var sharedPos = new SharedMemoryAllocator<float4>().allocate(blockDim.x);

    var acc = new float3();
    acc.x = 0.0F;
    acc.y = 0.0F;
    acc.z = 0.0F;

    for (var tile = 0; tile < numTiles; tile++)
    {
        // Each thread stages one position of the current tile into shared memory,
        // then the block synchronizes before anyone reads the tile.
        sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x];
        CUDAIntrinsics.__syncthreads();

        // "tile_calculation" from the GPU Gems 3 n-body article: interact
        // bodyPos with every body in the staged tile.
        for (var j = 0; j < blockDim.x; j++)
        {
            acc = BodyBodyInteraction(softeningSquared, acc, bodyPos, sharedPos[j]);
        }

        // Barrier before the next tile overwrites sharedPos.
        CUDAIntrinsics.__syncthreads();
    }

    return acc;
}
/// <summary>
/// Generic reduction of the first <paramref name="N"/> elements of
/// <paramref name="input"/> with <paramref name="reductor"/>, starting from
/// <paramref name="neutral"/>, atomically folded into <paramref name="result"/>[0].
/// </summary>
/// <param name="result">Single-element accumulator; result[0] receives the reduced value.</param>
/// <param name="input">Values to reduce.</param>
/// <param name="N">Number of elements to reduce.</param>
/// <param name="neutral">Identity element of the reduction (e.g. 0 for +, 1 for *).</param>
/// <param name="reductor">Associative binary operator applied pairwise.</param>
public static void InnerReduce(float[] result, float[] input, int N, float neutral, Func<float, float, float> reductor)
{
    // Block-shared scratch buffer, one slot per thread in the block.
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int lane = threadIdx.x;

    // Grid-stride pass: fold this thread's strided share of the input.
    float acc = neutral;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < N; idx += blockDim.x * gridDim.x)
    {
        acc = reductor(acc, input[idx]);
    }
    cache[lane] = acc;
    CUDAIntrinsics.__syncthreads();

    // Pairwise tree reduction within the block. NOTE(review): assumes
    // blockDim.x is a power of two — confirm launch config.
    for (int half = blockDim.x / 2; half != 0; half >>= 1)
    {
        if (lane < half)
        {
            cache[lane] = reductor(cache[lane], cache[lane + half]);
        }
        CUDAIntrinsics.__syncthreads();
    }

    // Thread 0 folds the block's partial result into the global accumulator.
    if (lane == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], reductor);
    }
}
/// <summary>
/// Sums the first <paramref name="N"/> elements of <paramref name="a"/> into
/// <paramref name="result"/>[0] via a per-block shared-memory tree reduction
/// and one Interlocked.Add per block.
/// </summary>
/// <param name="N">Number of elements to sum.</param>
/// <param name="a">Input values.</param>
/// <param name="result">Single-element accumulator; result[0] receives the sum.</param>
public static void ReduceAdd(int N, int[] a, int[] result)
{
    // Block-shared scratch buffer, one slot per thread in the block.
    var cache = new SharedMemoryAllocator<int>().allocate(blockDim.x);
    int lane = threadIdx.x;

    // Grid-stride accumulation of this thread's strided share of the input.
    int partial = 0;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < N; idx += blockDim.x * gridDim.x)
    {
        partial += a[idx];
    }
    cache[lane] = partial;
    CUDAIntrinsics.__syncthreads();

    // Pairwise tree reduction within the block. NOTE(review): assumes
    // blockDim.x is a power of two — confirm launch config.
    for (int half = blockDim.x / 2; half != 0; half >>= 1)
    {
        if (lane < half)
        {
            cache[lane] += cache[lane + half];
        }
        CUDAIntrinsics.__syncthreads();
    }

    // Integer path can use Interlocked.Add directly (no lambda needed).
    if (lane == 0)
    {
        Interlocked.Add(ref result[0], cache[0]);
    }
}
/// <summary>
/// Reduces the first <paramref name="N"/> elements of <paramref name="input"/>
/// with this instance's <c>reductor</c> (its <c>neutral</c> element and
/// <c>func</c> operator). Unlike the atomic variants, each block writes its
/// own partial result to <paramref name="result"/>[blockIdx.x]; a second pass
/// or host-side fold is expected to combine the per-block partials.
/// </summary>
/// <param name="result">Per-block output; result[blockIdx.x] receives the block's partial.</param>
/// <param name="input">Values to reduce.</param>
/// <param name="N">Number of elements to reduce.</param>
public void Reduce(float[] result, float[] input, int N)
{
    // Block-shared scratch buffer, one slot per thread in the block.
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int lane = threadIdx.x;

    // Grid-stride pass: fold this thread's strided share of the input.
    float acc = reductor.neutral;
    for (int idx = threadIdx.x + blockDim.x * blockIdx.x; idx < N; idx += blockDim.x * gridDim.x)
    {
        acc = reductor.func(acc, input[idx]);
    }
    cache[lane] = acc;
    CUDAIntrinsics.__syncthreads();

    // Pairwise tree reduction within the block. NOTE(review): assumes
    // blockDim.x is a power of two — confirm launch config.
    for (int half = blockDim.x / 2; half != 0; half >>= 1)
    {
        if (lane < half)
        {
            cache[lane] = reductor.func(cache[lane], cache[lane + half]);
        }
        CUDAIntrinsics.__syncthreads();
    }

    // No atomics: one result slot per block.
    if (lane == 0)
    {
        result[blockIdx.x] = cache[0];
    }
}