Esempio n. 1
0
        /// <summary>
        /// Tiled matrix-multiplication kernel body. Each thread accumulates one element of
        /// C = A * B by walking BLOCK_SIZE-wide tiles of A (along a row strip) and of B (down a
        /// column strip) that the thread block first copies into <paramref name="As"/> and
        /// <paramref name="Bs"/>.
        /// </summary>
        /// <param name="A">Left operand, indexed row-major with a row width of <paramref name="wA"/>.</param>
        /// <param name="B">Right operand, indexed row-major with a row width of <paramref name="wB"/>.</param>
        /// <param name="C">Result, indexed row-major with a row width of <paramref name="wB"/>.</param>
        /// <param name="As">Staging tile for A; indices up to BLOCK_SIZE * BLOCK_SIZE - 1 are addressed.</param>
        /// <param name="Bs">Staging tile for B; indices up to BLOCK_SIZE * BLOCK_SIZE - 1 are addressed.</param>
        /// <param name="wA">Row width of A, i.e. the length of the dot product computed per element.</param>
        /// <param name="wB">Row width of B and of C.</param>
        /// <remarks>
        /// No bounds checks are performed on any array access.
        /// NOTE(review): this presumably requires a BLOCK_SIZE x BLOCK_SIZE thread block and
        /// matrix dimensions that are multiples of BLOCK_SIZE - confirm against the launch code.
        /// </remarks>
        private static void MatMul([Global] float[] A, [Global] float[] B, [Global] float[] C,
                                   [Shared] float[] As, [Shared] float[] Bs, uint wA, uint wB)
        {
            // Thread index inside the block
            uint tx = ThreadIdx.X;
            uint ty = ThreadIdx.Y;

            // Index of the first sub-matrix of A processed by the block
            uint aBegin = wA * BLOCK_SIZE * BlockIdx.Y;

            // One past the last element of the first row of A's tile strip. The bound is
            // exclusive on purpose: wA is unsigned, so the inclusive form (aBegin + wA - 1)
            // wraps around to a huge value when wA == 0 and the loop below would then read
            // far outside A and B instead of being skipped.
            uint aEnd = aBegin + wA;

            // Step size used to iterate through the sub-matrices of A
            uint aStep = BLOCK_SIZE;

            // Index of the first sub-matrix of B processed by the block
            uint bBegin = BLOCK_SIZE * BlockIdx.X;

            // Step size used to iterate through the sub-matrices of B
            uint bStep = BLOCK_SIZE * wB;

            // Csub accumulates the element of the block sub-matrix
            // that is computed by this thread
            float Csub = 0;

            // Loop over all the sub-matrices of A and B
            // required to compute the block sub-matrix
            for (uint a = aBegin, b = bBegin; a < aEnd; a += aStep, b += bStep)
            {
                // Stage the current tiles: each thread copies
                // one element of A and one element of B
                As[ty * BLOCK_SIZE + tx] = A[a + wA * ty + tx];
                Bs[ty * BLOCK_SIZE + tx] = B[b + wB * ty + tx];

                // Synchronize to make sure both tiles are fully loaded
                BuiltinFunctions.SyncThreads();

                // Multiply row ty of the A tile by column tx of the B tile;
                // each thread contributes to exactly one element of the result
                for (uint k = 0; k < BLOCK_SIZE; ++k)
                {
                    Csub += As[ty * BLOCK_SIZE + k] * Bs[k * BLOCK_SIZE + tx];
                }

                // Synchronize to make sure that the preceding
                // computation is done before the tiles are overwritten
                // with the next sub-matrices of A and B
                BuiltinFunctions.SyncThreads();
            }

            // Write the block sub-matrix to the result;
            // each thread writes one element
            uint c = wB * BLOCK_SIZE * BlockIdx.Y + BLOCK_SIZE * BlockIdx.X;

            C[c + wB * ty + tx] = Csub;
        }
Esempio n. 2
0
        /// <summary>
        /// One half-sweep of a red-black successive over-relaxation update on
        /// <paramref name="grid"/>. The block first copies its tile of the grid (including a
        /// one-cell border on each side) into <paramref name="buf"/>, then rewrites every second
        /// interior cell of each row from the buffered neighbour values.
        /// </summary>
        /// <param name="grid">Solution values, read and updated in place; rows are <paramref name="gstride"/> apart.</param>
        /// <param name="laplacian">Right-hand-side values, indexed without the border offset; rows are <paramref name="lstride"/> apart.</param>
        /// <param name="dimX">Grid extent in X used to clip the tile.</param>
        /// <param name="dimY">Grid extent in Y used to clip the tile.</param>
        /// <param name="gstride">Row stride of <paramref name="grid"/>.</param>
        /// <param name="lstride">Row stride of <paramref name="laplacian"/>.</param>
        /// <param name="hx">Grid spacing in X.</param>
        /// <param name="hy">Grid spacing in Y.</param>
        /// <param name="omega">Relaxation factor; the old value is weighted by (1 - omega).</param>
        /// <param name="color">Selects which cells are updated: in tile row r the updated local columns have parity (color + r) % 2.</param>
        /// <param name="buf">Per-block tile buffer addressed through IdxBuf(row, col).</param>
        private static void PoissonRBSOR_LMem([Global] float[] grid, [Global] float[] laplacian,
                                              int dimX, int dimY, int gstride, int lstride,
                                              float hx, float hy, float omega, int color,
                                              [Shared] float[] buf)
        {
            int tx    = (int)ThreadIdx.X;
            int ty    = (int)ThreadIdx.Y;
            int stepX = (int)BlockDim.X;
            int stepY = (int)BlockDim.Y;

            // Grid coordinates of the tile's top-left cell (border included).
            int originX = (int)BlockIdx.X * AREA_SIZE_X;
            int originY = (int)BlockIdx.Y * AREA_SIZE_Y;

            // Tile extent including the border, clipped against the grid extent.
            int tileCols = BuiltinFunctions.Min(AREA_SIZE_X + 2, dimX - originX);
            int tileRows = BuiltinFunctions.Min(AREA_SIZE_Y + 2, dimY - originY);

            // Phase 1: cooperative copy of the tile into buf.
            for (int r = ty; r < tileRows; r += stepY)
            {
                int gridRowBase = originX + (r + originY) * gstride;
                for (int c = tx; c < tileCols; c += stepX)
                {
                    buf[IdxBuf(r, c)] = grid[gridRowBase + c];
                }
            }

            // Every thread must see the complete tile before neighbours are read.
            BuiltinFunctions.SyncThreads();

            // Interior extent: the tile without its one-cell border.
            int innerCols = tileCols - 2;
            int innerRows = tileRows - 2;

            // Coefficients of the update; they do not depend on the cell.
            float rhsWeight  = 2 * hx * hy;
            float vertWeight = 2 * hy / hx;
            float horzWeight = 2 * hx / hy;
            float relax      = 0.5f * omega / (vertWeight + horzWeight);
            float keep       = 1 - omega;

            // Each thread handles every second column, so the column step is doubled.
            int firstPair = 2 * tx;
            int pairStep  = 2 * stepX;

            // Phase 2: update the cells of the selected parity.
            for (int r = ty; r < innerRows; r += stepY)
            {
                int gy = r + originY;
                for (int c = firstPair + (color + r) % 2; c < innerCols; c += pairStep)
                {
                    int gx = c + originX;

                    // Interior cell (r, c) sits at buf position (r + 1, c + 1).
                    float vertical   = buf[IdxBuf(r + 2, c + 1)] + buf[IdxBuf(r, c + 1)];
                    float horizontal = buf[IdxBuf(r + 1, c + 2)] + buf[IdxBuf(r + 1, c)];
                    float stencil    = rhsWeight * laplacian[gx + gy * lstride] +
                                       vertWeight * vertical +
                                       horzWeight * horizontal;

                    grid[gx + 1 + (gy + 1) * gstride] = stencil * relax +
                                                        buf[IdxBuf(r + 1, c + 1)] * keep;
                }
            }
        }
Esempio n. 3
0
        /// <summary>
        /// One Jacobi-style sweep over this block's tile: the tile of <paramref name="input"/>
        /// (including a one-cell border on each side) is copied into <paramref name="buf"/>, then
        /// every interior cell of <paramref name="output"/> is set to a weighted sum of its four
        /// buffered neighbours plus a source term, divided by <paramref name="a"/>.
        /// </summary>
        /// <param name="input">Values of the previous iteration; rows are <paramref name="stride"/> apart.</param>
        /// <param name="output">Receives the new interior values; same layout as <paramref name="input"/>.</param>
        /// <param name="buf">Per-block tile buffer addressed through IdxBuf(row, col).</param>
        /// <param name="dimX">Grid extent in X used to clip the tile.</param>
        /// <param name="dimY">Grid extent in Y used to clip the tile.</param>
        /// <param name="stride">Row stride of <paramref name="input"/> and <paramref name="output"/>.</param>
        /// <param name="a1">Weight of the neighbour in the next row.</param>
        /// <param name="a2">Weight of the neighbour in the next column.</param>
        /// <param name="a3">Weight of the neighbour in the previous row.</param>
        /// <param name="a4">Weight of the neighbour in the previous column.</param>
        /// <param name="a">Divisor applied to the weighted sum.</param>
        /// <param name="hx">Grid spacing in X.</param>
        /// <param name="hy">Grid spacing in Y.</param>
        /// <param name="x0">X coordinate corresponding to grid column 0.</param>
        /// <param name="y0">Y coordinate corresponding to grid row 0.</param>
        /// <remarks>
        /// All extents are unsigned, so every subtraction below is guarded against wrapping:
        /// a block whose origin lies outside the grid, or whose clipped tile has fewer than
        /// three rows or columns, simply updates nothing.
        /// NOTE(review): J is defined elsewhere; it is presumably the right-hand-side
        /// function evaluated at the cell's physical coordinates - confirm.
        /// </remarks>
        private static void PoissonJacobi([Global] float[] input, [Global] float[] output, [Shared] float[] buf,
                                          uint dimX, uint dimY, uint stride,
                                          float a1, float a2, float a3, float a4, float a,
                                          float hx, float hy, float x0, float y0)
        {
            // Grid coordinates of the tile's top-left cell (border included).
            uint originX = BlockIdx.X * AREA_SIZE_X;
            uint originY = BlockIdx.Y * AREA_SIZE_Y;

            // Tile extent including the border, clipped against the grid extent.
            // dimX - originX would wrap around for a block that starts past the grid,
            // so the subtraction is only performed when it cannot underflow.
            uint col_cnt = 0;
            uint row_cnt = 0;
            if (dimX > originX)
            {
                col_cnt = BuiltinFunctions.Min(AREA_SIZE_X + 2, dimX - originX);
            }
            if (dimY > originY)
            {
                row_cnt = BuiltinFunctions.Min(AREA_SIZE_Y + 2, dimY - originY);
            }

            // Phase 1: cooperative copy of the tile into buf.
            for (uint row = ThreadIdx.Y; row < row_cnt; row += BlockDim.Y)
            {
                uint x   = ThreadIdx.X + originX;
                uint y   = row + originY;
                uint idx = x + y * stride;
                for (uint col = ThreadIdx.X; col < col_cnt; col += BlockDim.X, idx += BlockDim.X)
                {
                    buf[IdxBuf(row, col)] = input[idx];
                }
            }

            // Every thread must see the complete tile before neighbours are read.
            BuiltinFunctions.SyncThreads();

            // Scale of the source term; identical for every cell.
            float sourceScale = 2 * hx * hy;

            // Phase 2: update the interior (tile minus its one-cell border).
            // The bounds are written as "index + 2 < count" rather than
            // "index < count - 2" because count - 2 wraps for unsigned counts below 2.
            for (uint row = ThreadIdx.Y; row + 2 < row_cnt; row += BlockDim.Y)
            {
                uint x   = 1 + ThreadIdx.X + originX;
                uint y   = 1 + row + originY;
                uint idx = x + y * stride;

                // Physical y coordinate of this row; constant across the column loop.
                float py = y0 + y * hy;

                for (uint col = ThreadIdx.X; col + 2 < col_cnt; col += BlockDim.X, idx += BlockDim.X, x += BlockDim.X)
                {
                    float F = sourceScale * J(x0 + x * hx, py);

                    // Interior cell (row, col) sits at buf position (row + 1, col + 1).
                    output[idx] = (a1 * buf[IdxBuf(row + 2, col + 1)] + a2 * buf[IdxBuf(row + 1, col + 2)] +
                                   a3 * buf[IdxBuf(row, col + 1)] + a4 * buf[IdxBuf(row + 1, col)] + F) / a;
                }
            }
        }