// Tiled matrix product kernel: every thread accumulates one element of C = A * B.
//
// A, B and C are flat arrays; A is addressed with a row length of wA, while B and C
// are addressed with a row length of wB. As and Bs are the [Shared] staging buffers,
// each holding one BLOCK_SIZE x BLOCK_SIZE tile addressed as row * BLOCK_SIZE + column.
// NOTE(review): no bounds checks are performed, so this presumably expects the matrix
// dimensions to be multiples of BLOCK_SIZE - confirm against the launch code.
private static void MatMul([Global] float[] A, [Global] float[] B, [Global] float[] C, [Shared] float[] As, [Shared] float[] Bs, uint wA, uint wB)
{
    // Position of this thread inside its block
    uint localCol = ThreadIdx.X;
    uint localRow = ThreadIdx.Y;

    // Per-thread offsets that stay constant for the whole kernel:
    // the thread's slot in a staged tile and its offset inside a tile of A / B (and C)
    uint tileRowBase = localRow * BLOCK_SIZE;
    uint tileSlot = tileRowBase + localCol;
    uint aThreadOffset = wA * localRow + localCol;
    uint bThreadOffset = wB * localRow + localCol;

    // Range of tile origins in A walked by this block (one row of tiles)
    uint aFirst = wA * BLOCK_SIZE * BlockIdx.Y;
    uint aLast = aFirst + wA - 1;
    uint aStride = BLOCK_SIZE;

    // Tile origins in B walked by this block (one column of tiles)
    uint bFirst = BLOCK_SIZE * BlockIdx.X;
    uint bStride = BLOCK_SIZE * wB;

    // Running dot product for the single output element owned by this thread
    float acc = 0;

    uint aPos = aFirst;
    uint bPos = bFirst;
    while (aPos <= aLast)
    {
        // Stage one tile of A and one tile of B; each thread copies one element of each
        As[tileSlot] = A[aPos + aThreadOffset];
        Bs[tileSlot] = B[bPos + bThreadOffset];

        // Wait until both tiles are completely staged
        BuiltinFunctions.SyncThreads();

        // Row of the A tile times column of the B tile
        for (uint k = 0; k < BLOCK_SIZE; k++)
        {
            acc += As[tileRowBase + k] * Bs[k * BLOCK_SIZE + localCol];
        }

        // Wait until every thread is done with the staged tiles
        // before they are overwritten in the next pass
        BuiltinFunctions.SyncThreads();

        aPos += aStride;
        bPos += bStride;
    }

    // Store the result; each thread writes exactly one element of C
    uint cBase = wB * BLOCK_SIZE * BlockIdx.Y + BLOCK_SIZE * BlockIdx.X;
    C[cBase + bThreadOffset] = acc;
}
// One half-sweep of a successive over-relaxation update on a 2D grid, restricted to
// the points whose parity matches `color` (red-black ordering, going by the method name).
//
// grid      - values updated in place; rows are gstride elements apart
// laplacian - right-hand side values; rows are lstride elements apart
// dimX/dimY - grid extents used to clip the region handled by the last blocks
// hx, hy    - grid spacings entering the stencil weights
// omega     - relaxation factor: the result is (omega-weighted update) + (1 - omega) * old value
// color     - parity selector: in row r the first updated column is offset by (color + r) % 2
// buf       - [Shared] staging buffer addressed through IdxBuf(row, col)
//
// Each block first copies up to (AREA_SIZE_Y + 2) x (AREA_SIZE_X + 2) grid values into buf,
// then updates the points one row/column inside that region from the staged copy.
private static void PoissonRBSOR_LMem([Global] float[] grid, [Global] float[] laplacian, int dimX, int dimY, int gstride, int lstride, float hx, float hy, float omega, int color, [Shared] float[] buf)
{
    int tx = (int)ThreadIdx.X;
    int ty = (int)ThreadIdx.Y;
    int stepX = (int)BlockDim.X;
    int stepY = (int)BlockDim.Y;

    // Grid coordinates of the first staged element of this block
    int originX = (int)BlockIdx.X * AREA_SIZE_X;
    int originY = (int)BlockIdx.Y * AREA_SIZE_Y;

    // Size of the staged region, clipped at the grid boundary
    int stagedCols = BuiltinFunctions.Min(AREA_SIZE_X + 2, dimX - originX);
    int stagedRows = BuiltinFunctions.Min(AREA_SIZE_Y + 2, dimY - originY);

    // Phase 1: cooperative copy of the staged region from grid into buf
    for (int r = ty; r < stagedRows; r += stepY)
    {
        int src = (tx + originX) + (r + originY) * gstride;
        for (int c = tx; c < stagedCols; c += stepX)
        {
            buf[IdxBuf(r, c)] = grid[src];
            src += stepX;
        }
    }

    // All of buf must be filled before any thread reads its neighbours
    BuiltinFunctions.SyncThreads();

    // Phase 2 covers the staged region minus a one-element border on every side
    int innerCols = stagedCols - 2;
    int innerRows = stagedRows - 2;

    // Each thread handles every second column, so both the start and the stride are doubled
    int firstCol = 2 * tx;
    int colStride = 2 * stepX;

    // Stencil coefficients
    float rhsScale = 2 * hx * hy;
    float rowWeight = 2 * hy / hx;
    float colWeight = 2 * hx / hy;
    float relax = 0.5f * omega / (rowWeight + colWeight);
    float keep = 1 - omega;

    for (int r = ty; r < innerRows; r += stepY)
    {
        // Shift the starting column by the parity of (color + row)
        int startCol = firstCol + (color + r) % 2;
        int gx = startCol + originX;
        int gy = r + originY;
        int dst = gx + 1 + (gy + 1) * gstride;

        for (int c = startCol; c < innerCols; c += colStride)
        {
            // The updated point sits at (r + 1, c + 1) in buf; fetch it and its four neighbours
            float nextRow = buf[IdxBuf(r + 2, c + 1)];
            float prevRow = buf[IdxBuf(r, c + 1)];
            float nextCol = buf[IdxBuf(r + 1, c + 2)];
            float prevCol = buf[IdxBuf(r + 1, c)];
            float centre = buf[IdxBuf(r + 1, c + 1)];

            grid[dst] = (rhsScale * laplacian[gx + gy * lstride] + rowWeight * (nextRow + prevRow) + colWeight * (nextCol + prevCol)) * relax + centre * keep;

            dst += colStride;
            gx += colStride;
        }
    }
}
// One Jacobi-style stencil pass: output = (weighted sum of the four neighbours in input + F) / a.
//
// input / output - source and destination grids; rows are `stride` elements apart
// buf            - [Shared] staging buffer addressed through IdxBuf(row, col)
// dimX / dimY    - grid extents used to clip the region handled by the last blocks
// a1..a4         - weights of the neighbours at (row + 1), (col + 1), (row - 1), (col - 1)
//                  relative to the updated point, in that order
// a              - divisor applied to the weighted sum
// hx, hy, x0, y0 - grid spacings and origin; the point (x0 + x * hx, y0 + y * hy) is passed to J
//                  NOTE(review): J is defined elsewhere - presumably the source term; confirm.
//
// Each block first copies up to (AREA_SIZE_Y + 2) x (AREA_SIZE_X + 2) input values into buf,
// then writes the points one row/column inside that region to output.
private static void PoissonJacobi([Global] float[] input, [Global] float[] output, [Shared] float[] buf, uint dimX, uint dimY, uint stride, float a1, float a2, float a3, float a4, float a, float hx, float hy, float x0, float y0)
{
    // Size of the staged region, clipped at the grid boundary
    uint col_cnt = BuiltinFunctions.Min(AREA_SIZE_X + 2, dimX - BlockIdx.X * AREA_SIZE_X);
    uint row_cnt = BuiltinFunctions.Min(AREA_SIZE_Y + 2, dimY - BlockIdx.Y * AREA_SIZE_Y);

    // Phase 1: cooperative copy of the staged region from input into buf
    for (uint row = ThreadIdx.Y; row < row_cnt; row += BlockDim.Y)
    {
        uint x = ThreadIdx.X + BlockIdx.X * AREA_SIZE_X;
        uint y = row + BlockIdx.Y * AREA_SIZE_Y;
        uint idx = x + y * stride;
        for (uint col = ThreadIdx.X; col < col_cnt; col += BlockDim.X, idx += BlockDim.X)
        {
            buf[IdxBuf(row, col)] = input[idx];
        }
    }

    // All of buf must be filled before any thread reads its neighbours
    BuiltinFunctions.SyncThreads();

    // Phase 2 covers the staged region minus a one-element border on every side.
    // The counts are unsigned, so a plain "-= 2" would wrap around to a huge value
    // when fewer than two rows/columns were staged; clamp to zero instead so such
    // blocks simply have nothing to update.
    col_cnt = col_cnt > 2 ? col_cnt - 2 : 0;
    row_cnt = row_cnt > 2 ? row_cnt - 2 : 0;

    for (uint row = ThreadIdx.Y; row < row_cnt; row += BlockDim.Y)
    {
        // Grid coordinates of the updated point (shifted by one to skip the border)
        uint x = 1 + ThreadIdx.X + BlockIdx.X * AREA_SIZE_X;
        uint y = 1 + row + BlockIdx.Y * AREA_SIZE_Y;
        uint idx = x + y * stride;
        for (uint col = ThreadIdx.X; col < col_cnt; col += BlockDim.X, idx += BlockDim.X, x += BlockDim.X)
        {
            // Contribution of J evaluated at the point's coordinates
            float F = 2 * hx * hy * J(x0 + x * hx, y0 + y * hy);

            // The updated point sits at (row + 1, col + 1) in buf
            output[idx] = (a1 * buf[IdxBuf(row + 2, col + 1)] + a2 * buf[IdxBuf(row + 1, col + 2)] + a3 * buf[IdxBuf(row, col + 1)] + a4 * buf[IdxBuf(row + 1, col)] + F) / a;
        }
    }
}