public static void Total(FloatResidentArray a, int N, float[] total)
        {
            var cache = new SharedMemoryAllocator <float>().allocate(blockDim.x);

            int   tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int   cacheIndex = threadIdx.x;
            float sum        = 0f;

            // Grid-stride loop: each thread accumulates a strided partial sum of a[]
            while (tid < N)
            {
                sum  = sum + a[tid];
                tid += blockDim.x * gridDim.x;
            }
            cache[cacheIndex] = sum;
            CUDAIntrinsics.__syncthreads();
            // Tree reduction in shared memory (assumes blockDim.x is a power of two)
            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = cache[cacheIndex] + cache[cacheIndex + i];
                }
                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            // Thread 0 atomically adds this block's partial sum into the global total
            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
            }
        }
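For context, here is a minimal host-side sketch of how a kernel like Total is typically launched. It assumes the Hybridizer Essentials HybRunner API (HybRunner.Cuda(), SetDistrib, Wrap); the grid/block sizes and the Program class name are illustrative, not taken from the example above.

        // Hypothetical launch code (assumptions noted above)
        HybRunner runner  = HybRunner.Cuda().SetDistrib(16, 128); // 16 blocks of 128 threads (illustrative)
        dynamic   wrapped = runner.Wrap(new Program());           // Program = the class containing Total
        wrapped.Total(a, N, total);                               // total[0] receives the reduced sum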
Example #2
        public static void Multiply(NaiveMatrix result, NaiveMatrix A, NaiveMatrix B, int size)
        {
            SharedMemoryAllocator <float> allocator = new SharedMemoryAllocator <float>();

            float[] cacheA = allocator.allocate(blockDim.y * blockDim.x);
            float[] cacheB = allocator.allocate(blockDim.y * blockDim.x);

            for (int by = blockIdx.y; by < size / blockDim.y; by += gridDim.y)
            {
                for (int bx = blockIdx.x; bx < size / blockDim.x; bx += gridDim.x)
                {
                    int tx = threadIdx.x, ty = threadIdx.y;

                    int i = by * blockDim.y + ty;
                    int j = bx * blockDim.x + tx;

                    // Out-of-range guard: only safe when size is a multiple of the block
                    // dimensions, because every thread of the block must reach SyncThreads() below
                    if (i >= size || j >= size)
                    {
                        return;
                    }

                    float Pvalue = 0;
                    for (int blockIdread = 0; blockIdread < size / blockDim.x; ++blockIdread)
                    {
                        // Stage one tile of A and one tile of B in shared memory
                        // (row stride is blockDim.x, matching the reads below)
                        cacheA[ty * blockDim.x + tx] = A[i * size + (blockIdread * blockDim.x + tx)];
                        cacheB[ty * blockDim.x + tx] = B[(blockIdread * blockDim.x + ty) * size + j];

                        SyncThreads();

                        // Accumulate the partial dot product contributed by this tile
                        for (int k = 0; k < blockDim.x; ++k)
                        {
                            Pvalue += cacheA[ty * blockDim.x + k] * cacheB[k * blockDim.x + tx];
                        }

                        SyncThreads();
                    }

                    result[i * size + j] = Pvalue;
                }
            }
        }
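The tiled kernel above assumes square tiles (blockDim.x == blockDim.y) and a size that is a multiple of the tile width. A plain CPU reference such as the following hypothetical helper (not part of the sample) can be used to validate the GPU result on flattened row-major buffers:

        static void MultiplyCpu(float[] result, float[] A, float[] B, int size)
        {
            for (int i = 0; i < size; ++i)
            {
                for (int j = 0; j < size; ++j)
                {
                    float acc = 0f;
                    for (int k = 0; k < size; ++k)
                    {
                        acc += A[i * size + k] * B[k * size + j];
                    }
                    result[i * size + j] = acc;
                }
            }
        }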
Example #3
        public static void ScalarProd(int N, FloatResidentArray a, FloatResidentArray b, float[] result)
        {
            var cache      = new SharedMemoryAllocator <float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;

            while (tid < N)
            {
                tmp += a[tid] * b[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicAdd(ref result[0], cache[0]);
            }
        }
Example #4
        private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
        {
            var cache      = new SharedMemoryAllocator <float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;

            while (tid < N)
            {
                tmp += r1[tid] * r2[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
            }
        }
Example #5
        public static float3 ComputeBodyAccel(float softeningSquared, float4 bodyPos, float4[] positions, int numTiles)
        {
            var sharedPos = new SharedMemoryAllocator <float4>().allocate(blockDim.x);
            var acc       = new float3(); // components zeroed explicitly below

            acc.x = 0.0F; acc.y = 0.0F; acc.z = 0.0F;

            for (var tile = 0; tile < numTiles; tile++)
            {
                sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x];

                CUDAIntrinsics.__syncthreads();

                // This is the "tile_calculation" from the GPU Gems 3 article.
                for (var counter = 0; counter < blockDim.x; counter++)
                {
                    acc = BodyBodyInteraction(softeningSquared, acc, bodyPos, sharedPos[counter]);
                }

                CUDAIntrinsics.__syncthreads();
            }

            return(acc);
        }
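BodyBodyInteraction is not shown in this listing. The sketch below is a hedged guess at what it computes, following the standard GPU Gems 3 / CUDA n-body formulation (softened gravitational attraction, with the mass stored in the w component); the actual implementation in the sample may differ:

        // Hedged sketch only: acceleration contribution of body bj on body bi
        public static float3 BodyBodyInteraction(float softeningSquared, float3 ai, float4 bi, float4 bj)
        {
            float rx = bj.x - bi.x;
            float ry = bj.y - bi.y;
            float rz = bj.z - bi.z;

            // distSqr = dot(r, r) + softening^2
            float distSqr = rx * rx + ry * ry + rz * rz + softeningSquared;

            // invDistCube = 1 / distSqr^(3/2)
            float invDist     = 1.0f / (float)Math.Sqrt(distSqr);
            float invDistCube = invDist * invDist * invDist;

            // s = m_j / distSqr^(3/2), assuming bj.w holds the mass
            float s = bj.w * invDistCube;

            ai.x += rx * s;
            ai.y += ry * s;
            ai.z += rz * s;
            return ai;
        }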
Example #6
        public static void reduceSinglePassMultiBlockCG(float[] g_idata, float[] g_odata, uint n)
        {
            // Handle to thread block group
            thread_block block = cg.this_thread_block();
            grid_group   grid  = cg.this_grid();

            double[] sdata = new SharedMemoryAllocator <double>().allocate(blockDim.x);

            // Stride over grid and add the values to a shared memory buffer
            sdata[block.thread_rank()] = 0;

            for (uint i = grid.thread_rank(); i < n; i += grid.size())
            {
                sdata[block.thread_rank()] += g_idata[i];
            }

            block.sync();

            // Reduce each block (called once per block)
            reduceBlock(sdata, block);
            // Write out the result to global memory
            if (block.thread_rank() == 0)
            {
                g_odata[blockIdx.x] = (float)sdata[0];
            }

            // Synchronize the whole grid before the final combine
            // (grid-wide synchronization requires a cooperative kernel launch)
            grid.sync();

            // Thread 0 of the entire grid folds the per-block partials into g_odata[0]
            if (grid.thread_rank() == 0)
            {
                for (uint blockId = 1; blockId < gridDim.x; blockId++)
                {
                    g_odata[0] += g_odata[blockId];
                }
            }
        }
Example #7
        public static void InnerReduce(float[] result, float[] input, int N, float neutral, Func <float, float, float> reductor)
        {
            var cache      = new SharedMemoryAllocator <float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = neutral; // 'neutral' must be the identity element of 'reductor'

            while (tid < N)
            {
                tmp  = reductor(tmp, input[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = reductor(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], reductor);
            }
        }
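InnerReduce is only correct when reductor is associative and commutative and neutral is its identity, because partial results are combined in an arbitrary order across threads and blocks. Illustrative pairings (not taken from the sample):

        // sum: InnerReduce(result, input, N, 0f,             (x, y) => x + y);
        // max: InnerReduce(result, input, N, float.MinValue, (x, y) => Math.Max(x, y));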
Example #8
        public static void ReduceAdd(int N, int[] a, int[] result)
        {
            var cache      = new SharedMemoryAllocator <int>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            int tmp = 0;

            while (tid < N)
            {
                tmp += a[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                Interlocked.Add(ref result[0], cache[0]);
            }
        }
Example #9
        public void Reduce(float[] result, float[] input, int N)
        {
            var cache      = new SharedMemoryAllocator <float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = reductor.neutral;

            while (tid < N)
            {
                tmp  = reductor.func(tmp, input[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = reductor.func(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                result[blockIdx.x] = cache[0]; // one partial result per block, no atomic combine
            }
        }
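Unlike the earlier variants, this kernel writes one partial result per block instead of combining atomically, so a second pass is still required. A minimal host-side combine, assuming result holds one partial per launched block (gridDimX here is illustrative):

        float total = reductor.neutral;
        for (int b = 0; b < gridDimX; ++b)   // gridDimX = number of blocks launched
        {
            total = reductor.func(total, result[b]);
        }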