// Block-wide sum reduction: each thread accumulates a grid-stride partial sum, the block
// reduces those partials in shared memory, and thread 0 of every block atomically adds
// the block total into total[0].
public static void Total(FloatResidentArray a, int N, float[] total)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    // Grid-stride loop: each thread sums every (blockDim.x * gridDim.x)-th element.
    float sum = 0f;
    while (tid < N)
    {
        sum = sum + a[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = sum;
    CUDAIntrinsics.__syncthreads();

    // Tree reduction in shared memory (assumes blockDim.x is a power of two).
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] = cache[cacheIndex] + cache[cacheIndex + i];
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    // One atomic add per block combines the per-block results.
    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
    }
}
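// A minimal CPU reference for checking Total on the host. This is a sketch, not part of
// the kernel above; it only assumes the FloatResidentArray indexer already used there,
// and the method name is illustrative.
public static float TotalReference(FloatResidentArray a, int N)
{
    float sum = 0f;
    for (int k = 0; k < N; ++k)
    {
        sum += a[k];
    }
    return sum;
}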
// Tiled matrix multiplication: each thread block computes tiles of the result, staging
// tiles of A and B in shared memory. Assumes square thread blocks (blockDim.x == blockDim.y)
// and that size is a multiple of the tile width.
public static void Multiply(NaiveMatrix result, NaiveMatrix A, NaiveMatrix B, int size)
{
    SharedMemoryAllocator<float> allocator = new SharedMemoryAllocator<float>();
    float[] cacheA = allocator.allocate(blockDim.y * blockDim.x);
    float[] cacheB = allocator.allocate(blockDim.y * blockDim.x);

    // Grid-stride loops over the output tiles.
    for (int by = blockIdx.y; by < size / blockDim.y; by += gridDim.y)
    {
        for (int bx = blockIdx.x; bx < size / blockDim.x; bx += gridDim.x)
        {
            int tx = threadIdx.x, ty = threadIdx.y;
            int i = by * blockDim.y + ty;   // row of the result element
            int j = bx * blockDim.x + tx;   // column of the result element
            if (i >= size || j >= size)
            {
                return;
            }

            float Pvalue = 0;
            for (int blockIdread = 0; blockIdread < size / blockDim.x; ++blockIdread)
            {
                // Stage one tile of A and one tile of B in shared memory
                // (row stride blockDim.x, consistent with the reads below).
                cacheA[ty * blockDim.x + tx] = A[i * size + (blockIdread * blockDim.x + tx)];
                cacheB[ty * blockDim.x + tx] = B[(blockIdread * blockDim.x + ty) * size + j];
                SyncThreads();

                // Accumulate the partial dot product carried by this pair of tiles.
                for (int k = 0; k < blockDim.x; ++k)
                {
                    Pvalue += cacheA[ty * blockDim.x + k] * cacheB[k * blockDim.x + tx];
                }
                SyncThreads();
            }

            result[i * size + j] = Pvalue;
        }
    }
}
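// A minimal CPU reference for checking Multiply. Sketch only; it assumes the same flat
// row-major NaiveMatrix indexer the kernel uses, and nothing else about the type.
public static void MultiplyReference(NaiveMatrix result, NaiveMatrix A, NaiveMatrix B, int size)
{
    for (int i = 0; i < size; ++i)
    {
        for (int j = 0; j < size; ++j)
        {
            float acc = 0f;
            for (int k = 0; k < size; ++k)
            {
                acc += A[i * size + k] * B[k * size + j];
            }
            result[i * size + j] = acc;
        }
    }
}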
// Dot product: grid-stride per-thread partial products, shared-memory tree reduction,
// then one atomic add of each block's partial result into result[0].
public static void ScalarProd(int N, FloatResidentArray a, FloatResidentArray b, float[] result)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    float tmp = 0.0F;
    while (tid < N)
    {
        tmp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    // Tree reduction in shared memory (assumes blockDim.x is a power of two).
    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    if (cacheIndex == 0)
    {
        AtomicAdd(ref result[0], cache[0]);
    }
}
// Same dot-product reduction as above, combining the per-block results with a generic
// atomic apply instead of a dedicated atomic add.
private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    float tmp = 0.0F;
    while (tid < N)
    {
        tmp += r1[tid] * r2[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
    }
}
// N-body acceleration for one body: walk the bodies tile by tile, staging each tile of
// positions in shared memory, and accumulate the pairwise interactions.
public static float3 ComputeBodyAccel(float softeningSquared, float4 bodyPos, float4[] positions, int numTiles)
{
    var sharedPos = new SharedMemoryAllocator<float4>().allocate(blockDim.x);

    var acc = new float3();   // accumulated acceleration, starts at (0, 0, 0)
    acc.x = 0.0F;
    acc.y = 0.0F;
    acc.z = 0.0F;

    for (var tile = 0; tile < numTiles; tile++)
    {
        // Each thread loads one position of the current tile into shared memory.
        sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x];
        CUDAIntrinsics.__syncthreads();

        // This is the "tile_calculation" from the GPU Gems 3 article.
        for (var counter = 0; counter < blockDim.x; counter++)
        {
            acc = BodyBodyInteraction(softeningSquared, acc, bodyPos, sharedPos[counter]);
        }
        CUDAIntrinsics.__syncthreads();
    }

    return acc;
}
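// BodyBodyInteraction is called above but not shown in this section. The sketch below is a
// hypothetical implementation of the usual GPU Gems 3 formulation, assuming the w component
// of a position carries the body's mass; it is illustrative, not the original helper.
public static float3 BodyBodyInteraction(float softeningSquared, float3 ai, float4 bi, float4 bj)
{
    // r_ij = p_j - p_i
    float rx = bj.x - bi.x;
    float ry = bj.y - bi.y;
    float rz = bj.z - bi.z;

    // distSqr = dot(r_ij, r_ij) + EPS^2 (softening avoids the singularity when bodies overlap)
    float distSqr = rx * rx + ry * ry + rz * rz + softeningSquared;

    // invDistCube = 1 / distSqr^(3/2)
    float invDist = 1.0f / (float)System.Math.Sqrt(distSqr);
    float invDistCube = invDist * invDist * invDist;

    // s = m_j / distSqr^(3/2)
    float s = bj.w * invDistCube;

    // a_i += r_ij * s
    ai.x += rx * s;
    ai.y += ry * s;
    ai.z += rz * s;
    return ai;
}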
// Single-pass reduction using cooperative groups: every block reduces its grid-stride
// partials in shared memory, writes one value per block, then the whole grid synchronizes
// and thread 0 of the grid folds the per-block results into g_odata[0].
public static void reduceSinglePassMultiBlockCG(float[] g_idata, float[] g_odata, uint n)
{
    // Handles to the thread block and grid groups.
    thread_block block = cg.this_thread_block();
    grid_group grid = cg.this_grid();

    double[] sdata = new SharedMemoryAllocator<double>().allocate(blockDim.x);

    // Stride over the grid and add the values to a shared memory buffer.
    sdata[block.thread_rank()] = 0;
    for (uint i = grid.thread_rank(); i < n; i += grid.size())
    {
        sdata[block.thread_rank()] += g_idata[i];
    }

    block.sync();

    // Reduce each block (called once per block).
    reduceBlock(sdata, block);

    // Write out the per-block result to global memory.
    if (block.thread_rank() == 0)
    {
        g_odata[blockIdx.x] = (float)sdata[0];
    }

    grid.sync();

    // A single thread of the grid accumulates the per-block results.
    if (grid.thread_rank() == 0)
    {
        for (uint blockId = 1; blockId < gridDim.x; blockId++)
        {
            g_odata[0] += g_odata[blockId];
        }
    }
}
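// reduceBlock is referenced above but not shown here. The sketch below is a simple
// shared-memory tree reduction over sdata, synchronizing the block between steps; it
// assumes blockDim.x is a power of two. (The original CUDA sample performs a warp-tiled
// reduction; this is a hypothetical, simpler stand-in.)
public static void reduceBlock(double[] sdata, thread_block block)
{
    uint tid = block.thread_rank();
    for (uint s = (uint)blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s)
        {
            sdata[tid] += sdata[tid + s];
        }
        block.sync();
    }
}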
// Generic reduction: fold the input with an arbitrary associative reductor and its
// neutral element, then combine the per-block results with an atomic apply.
public static void InnerReduce(float[] result, float[] input, int N, float neutral, Func<float, float, float> reductor)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    float tmp = neutral;
    while (tid < N)
    {
        tmp = reductor(tmp, input[tid]);
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] = reductor(cache[cacheIndex], cache[cacheIndex + i]);
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    if (cacheIndex == 0)
    {
        AtomicExpr.apply(ref result[0], cache[0], reductor);
    }
}
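// Host-side reference for InnerReduce: fold the input sequentially with the same reductor
// and neutral element. A sketch for validating the kernel's result; the name is illustrative.
public static float InnerReduceReference(float[] input, int N, float neutral, Func<float, float, float> reductor)
{
    float acc = neutral;
    for (int k = 0; k < N; ++k)
    {
        acc = reductor(acc, input[k]);
    }
    return acc;
}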
// Integer sum reduction: same pattern as above, with Interlocked.Add combining the
// per-block totals.
public static void ReduceAdd(int N, int[] a, int[] result)
{
    var cache = new SharedMemoryAllocator<int>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    int tmp = 0;
    while (tid < N)
    {
        tmp += a[tid];
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] += cache[cacheIndex + i];
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    if (cacheIndex == 0)
    {
        Interlocked.Add(ref result[0], cache[0]);
    }
}
// Two-stage generic reduction: each block writes its partial result to result[blockIdx.x];
// the caller is expected to fold the gridDim.x partials afterwards.
public void Reduce(float[] result, float[] input, int N)
{
    var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x);
    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int cacheIndex = threadIdx.x;

    float tmp = reductor.neutral;
    while (tid < N)
    {
        tmp = reductor.func(tmp, input[tid]);
        tid += blockDim.x * gridDim.x;
    }

    cache[cacheIndex] = tmp;
    CUDAIntrinsics.__syncthreads();

    int i = blockDim.x / 2;
    while (i != 0)
    {
        if (cacheIndex < i)
        {
            cache[cacheIndex] = reductor.func(cache[cacheIndex], cache[cacheIndex + i]);
        }
        CUDAIntrinsics.__syncthreads();
        i >>= 1;
    }

    if (cacheIndex == 0)
    {
        result[blockIdx.x] = cache[0];
    }
}
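// Reduce above leaves one partial value per block in result. Below is a sketch of the final
// host-side fold over those partials, assuming the same reductor semantics as the kernel;
// the name and signature are illustrative, not part of the original type.
public static float FinalizeReduce(float[] partials, int blockCount, float neutral, Func<float, float, float> reduce)
{
    float acc = neutral;
    for (int b = 0; b < blockCount; ++b)
    {
        acc = reduce(acc, partials[b]);
    }
    return acc;
}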