        public static void Total(FloatResidentArray a, int N, float[] total)
        {
            var cache = new SharedMemoryAllocator<float>().allocate(blockDim.x); // one shared-memory slot per thread in the block

            int   tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int   cacheIndex = threadIdx.x;
            float sum        = 0f;

            // Grid-stride loop: each thread accumulates a strided partial sum.
            while (tid < N)
            {
                sum += a[tid];
                tid += blockDim.x * gridDim.x;
            }
            cache[cacheIndex] = sum;
            CUDAIntrinsics.__syncthreads();
            // Tree reduction in shared memory; assumes blockDim.x is a power of two.
            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = cache[cacheIndex] + cache[cacheIndex + i];
                }
                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            // Thread 0 merges this block's partial sum into the global total.
            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref total[0], cache[0], (x, y) => x + y);
            }
        }
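
This is a standard two-phase reduction written for Altimesh's Hybridizer, which translates decorated .NET code to CUDA: a grid-stride loop builds per-thread partial sums, a shared-memory tree folds them within the block, and thread 0 pushes the block result into total[0] atomically. A minimal host-side launch might look like the sketch below; the HybRunner and FloatResidentArray calls follow the pattern of the Hybridizer basic samples, but the dll name, the Kernels class, and the grid shape are placeholders, not part of this listing.

        // Hypothetical host-side launch, patterned on the Hybridizer basic samples.
        // Adjust the dll name and launch configuration to your project.
        const int N = 1024 * 1024;
        var a = new FloatResidentArray(N);
        for (int k = 0; k < N; ++k) { a[k] = 1.0f; }
        a.RefreshDevice();                       // copy host data to the GPU

        float[] total = new float[1];            // total[0] starts at 0, the neutral element for a sum

        HybRunner runner = HybRunner.Cuda("Kernels_CUDA.dll").SetDistrib(16, 128);
        dynamic wrapped = runner.Wrap(new Kernels());
        wrapped.Total(a, N, total);

        Console.WriteLine(total[0]);             // expect 1048576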
Example #2
        public static void ScalarProd(int N, FloatResidentArray a, FloatResidentArray b, float[] result)
        {
            var cache      = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;

            while (tid < N)
            {
                tmp += a[tid] * b[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicAdd(ref result[0], cache[0]);
            }
        }
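
As with Total, result[0] accumulates across blocks, so it should be zeroed before the launch. A sequential host reference (an illustration, not part of the original listing) makes the kernel easy to validate; compare with a small tolerance, since the GPU sums in a different order than this loop.

        // Sequential dot-product reference for checking ScalarProd.
        static float ScalarProdReference(int N, FloatResidentArray a, FloatResidentArray b)
        {
            float acc = 0.0f;
            for (int k = 0; k < N; ++k)
            {
                acc += a[k] * b[k];
            }
            return acc;
        }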
Example #3
        private static void ScalarProd(float[] result, FloatResidentArray r1, FloatResidentArray r2, int N)
        {
            var cache      = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = 0.0F;

            while (tid < N)
            {
                tmp += r1[tid] * r2[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], (x, y) => x + y);
            }
        }
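
This private overload is identical to the previous example apart from parameter order and the atomic helper. One property shared by all of these reductions is worth calling out: the halving loop only merges every slot when blockDim.x is a power of two. With 6 threads, for example, the passes are i = 3 and then i = 1, and cache[2] is never added. A defensive host-side block-size choice (a sketch, not part of the original sample) is to round down:

        // Round a requested block size down to a power of two so the
        // shared-memory tree reduction above merges every slot.
        static int PreviousPowerOfTwo(int x)
        {
            int p = 1;
            while (p * 2 <= x)
            {
                p *= 2;
            }
            return p;
        }

Example #4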
        public static float3 ComputeBodyAccel(float softeningSquared, float4 bodyPos, float4[] positions, int numTiles)
        {
            var sharedPos = new SharedMemoryAllocator<float4>().allocate(blockDim.x);
            var acc = new float3(); // accumulated acceleration, zeroed below

            acc.x = 0.0F; acc.y = 0.0F; acc.z = 0.0F;

            for (var tile = 0; tile < numTiles; tile++)
            {
                sharedPos[threadIdx.x] = positions[tile * blockDim.x + threadIdx.x];

                CUDAIntrinsics.__syncthreads(); // wait until the whole tile is loaded

                // This is the "tile_calculation" from the GPUG3 article.
                for (var counter = 0; counter < blockDim.x; counter++)
                {
                    acc = BodyBodyInteraction(softeningSquared, acc, bodyPos, sharedPos[counter]);
                }

                CUDAIntrinsics.__syncthreads(); // keep the tile intact until every thread is done reading it
            }

            return acc;
        }
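
BodyBodyInteraction is not shown in this listing. In the GPU Gems 3 n-body formulation it applies one body's softened gravitational pull to an accumulated acceleration, with the mass carried in the w component of the position. A sketch consistent with that formulation follows; the member-wise float3/float4 access and the use of Math.Sqrt in place of a device rsqrt are assumptions.

        // Sketch of the softened body-body interaction from the GPU Gems 3
        // n-body chapter; bj.w is assumed to carry the mass of body j.
        static float3 BodyBodyInteraction(float softeningSquared, float3 ai, float4 bi, float4 bj)
        {
            float rx = bj.x - bi.x, ry = bj.y - bi.y, rz = bj.z - bi.z;
            float distSqr = rx * rx + ry * ry + rz * rz + softeningSquared;
            float invDist = 1.0f / (float)Math.Sqrt(distSqr);
            float s = bj.w * invDist * invDist * invDist; // m_j / dist^3
            ai.x += rx * s; ai.y += ry * s; ai.z += rz * s;
            return ai;
        }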
Example #5
        public static void InnerReduce(float[] result, float[] input, int N, float neutral, Func<float, float, float> reductor)
        {
            var cache      = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = neutral;

            while (tid < N)
            {
                tmp  = reductor(tmp, input[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = reductor(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                AtomicExpr.apply(ref result[0], cache[0], reductor);
            }
        }
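
Because the neutral element and the combiner are parameters, the same kernel computes different reductions. The usage sketch below assumes a "wrapped" HybRunner wrapper as in the earlier launch sketch, and that the wrapper accepts delegate arguments; note that result[0] must be seeded with the matching neutral element before each launch, since the kernel atomically combines into it.

        Func<float, float, float> sum = (x, y) => x + y;
        Func<float, float, float> max = Math.Max;

        result[0] = 0.0f;                        // neutral element of +
        wrapped.InnerReduce(result, input, N, 0.0f, sum);                    // sum of input

        result[0] = float.NegativeInfinity;      // neutral element of max
        wrapped.InnerReduce(result, input, N, float.NegativeInfinity, max);  // maximum of input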
Example #6
        public static void ReduceAdd(int N, int[] a, int[] result)
        {
            var cache      = new SharedMemoryAllocator<int>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            int tmp = 0;

            while (tid < N)
            {
                tmp += a[tid];
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] += cache[cacheIndex + i];
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                Interlocked.Add(ref result[0], cache[0]);
            }
        }
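
This integer variant finishes with System.Threading.Interlocked.Add, which Hybridizer maps to a device-side atomic add (to my knowledge; verify against your Hybridizer version). Since integer addition is associative and exact, a sequential reference should match the kernel result exactly, barring overflow:

        // Sequential reference for ReduceAdd (illustration only).
        static int ReduceAddReference(int N, int[] a)
        {
            int acc = 0;
            for (int k = 0; k < N; ++k)
            {
                acc += a[k];
            }
            return acc;
        }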
Example #7
        public void Reduce(float[] result, float[] input, int N)
        {
            var cache      = new SharedMemoryAllocator<float>().allocate(blockDim.x);
            int tid        = threadIdx.x + blockDim.x * blockIdx.x;
            int cacheIndex = threadIdx.x;

            float tmp = reductor.neutral;

            while (tid < N)
            {
                tmp  = reductor.func(tmp, input[tid]);
                tid += blockDim.x * gridDim.x;
            }

            cache[cacheIndex] = tmp;

            CUDAIntrinsics.__syncthreads();

            int i = blockDim.x / 2;

            while (i != 0)
            {
                if (cacheIndex < i)
                {
                    cache[cacheIndex] = reductor.func(cache[cacheIndex], cache[cacheIndex + i]);
                }

                CUDAIntrinsics.__syncthreads();
                i >>= 1;
            }

            if (cacheIndex == 0)
            {
                result[blockIdx.x] = cache[0];
            }
        }
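
Unlike the earlier variants, this kernel does not combine across blocks with an atomic: each block writes its partial result to result[blockIdx.x], so result must hold at least gridDim.x elements and the host performs the final fold. A possible finishing step, reusing the instance's reductor field from the surrounding class (the method name and shape are assumptions):

        // Hypothetical host-side finish: fold the per-block partials
        // written by Reduce into a single value with the same reductor.
        float FinishReduce(float[] partials, int numBlocks)
        {
            float acc = reductor.neutral;
            for (int b = 0; b < numBlocks; ++b)
            {
                acc = reductor.func(acc, partials[b]);
            }
            return acc;
        }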