// ReSharper disable once SuggestBaseTypeForParameter
/// <summary>
/// Block-level reduction kernel: each block combines up to <c>2 * blockDim.x</c> consecutive
/// elements of <paramref name="array"/> with <paramref name="op"/> and writes its partial
/// result to <c>result[blockIdx.x]</c>.
/// </summary>
/// <remarks>
/// Phases: (1) every thread folds two input elements into shared memory, (2) a shared-memory
/// tree halves the number of working threads while the stride is larger than a warp,
/// (3) the first warp finishes with register shuffles.
/// Requirements visible from the code: the extern shared array must hold at least
/// <c>blockDim.x</c> elements, and the halving tree only visits every slot exactly once when
/// <c>blockDim.x</c> is a power of two.
/// Threads (and lanes) whose global index lies at or beyond <paramref name="length"/> never
/// contribute to the result; a block that lies entirely beyond the input writes
/// <c>default(T)</c>.
/// </remarks>
/// <param name="array">Input elements.</param>
/// <param name="length">Number of valid elements in <paramref name="array"/>.</param>
/// <param name="result">Receives one partial result per block, indexed by the block index.</param>
/// <param name="op">Binary operator used to combine two values.</param>
private static void KernelSequentialReduceIdleThreadsWarp<T>(deviceptr<T> array, int length, T[] result, Func<T, T, T> op)
{
    var shared = __shared__.ExternArray<T>();

    var tid = threadIdx.x;
    var bid = blockIdx.x;
    var bdm = blockDim.x;
    var gid = 2 * bdm * bid + tid;

    // First fold happens while loading. A thread past the end of the input must not touch
    // 'array' at all; its slot gets a placeholder that is never combined, because every
    // fold below checks that the partner's global index is in range.
    if (gid < length)
    {
        shared[tid] = gid + bdm < length
            ? op(array[gid], array[gid + bdm])
            : array[gid];
    }
    else
    {
        shared[tid] = default(T);
    }

    DeviceFunction.SyncThreads();

    // Shared-memory tree for strides larger than one warp. Slot 'tid + s' holds data that
    // started at global index 'gid + s', hence the range check on that index.
    for (var s = bdm / 2; s > WarpSize; s >>= 1)
    {
        if (tid < s && gid + s < length)
        {
            shared[tid] = op(shared[tid], shared[tid + s]);
        }

        // Kept outside the conditional so that every thread of the block reaches the barrier.
        DeviceFunction.SyncThreads();
    }

    if (tid < WarpSize)
    {
        var value = shared[tid];

        // Fold the second warp's slots into the first warp (only present for blocks of at
        // least two warps, and only where that slot holds in-range data).
        if (bdm >= 2 * WarpSize && gid + WarpSize < length)
        {
            value = op(value, shared[tid + WarpSize]);
        }

        // Lanes that actually exist in this warp (blocks may be smaller than one warp).
        var lanes = bdm < WarpSize ? bdm : WarpSize;

        // The shuffle is executed by every lane of the warp; only the fold is conditional,
        // so values from missing or out-of-range lanes are discarded.
        for (var offset = 16; offset > 0; offset >>= 1)
        {
            var other = DeviceFunction.ShuffleDown(value, offset);
            if (tid + offset < lanes && gid + offset < length)
            {
                value = op(value, other);
            }
        }

        if (tid == 0)
        {
            result[bid] = value;
        }
    }
}
// ReSharper disable once SuggestBaseTypeForParameter
/// <summary>
/// Reduction kernel in which every thread first accumulates several input elements
/// (advancing by <c>gridDim.x * blockDim.x</c> per step), then each warp is reduced with
/// register shuffles, and finally thread 0 combines the per-warp partials and writes the
/// block's result to <c>result[blockIdx.x]</c>.
/// </summary>
/// <remarks>
/// The per-thread accumulator starts at <c>default(T)</c>, so the result is only meaningful
/// for operators for which <c>default(T)</c> is a neutral element (see the Todo below).
/// The per-warp scratch array has 32 slots, i.e. it covers blocks of up to 32 warps.
/// For a block of exactly 8 warps the partials are combined in the order
/// <c>((w0 w1)(w2 w3)) ((w4 w5)(w6 w7))</c>; warps that do not exist are replaced by
/// <c>default(T)</c>, and partials of further warps are folded in afterwards in index order.
/// </remarks>
/// <param name="array">Input elements.</param>
/// <param name="length">Number of valid elements in <paramref name="array"/>.</param>
/// <param name="result">Receives one partial result per block, indexed by the block index.</param>
/// <param name="op">Binary operator used to combine two values.</param>
private static void KernelSequentialReduceIdleThreadsWarpMultiple<T>(deviceptr<T> array, int length, deviceptr<T> result, Func<T, T, T> op)
{
    var tid = threadIdx.x;
    var bid = blockIdx.x;
    var bdm = blockDim.x;
    var gid = bdm * bid + tid;
    var stride = gridDim.x * bdm;

    // Todo: 'default(T)' is a bad idea, think of (n * 0) => The accumulator's initial value should be provided by the user!
    var accumulator = default(T);
    while (gid < length)
    {
        accumulator = op(accumulator, array[gid]);
        gid += stride;
    }

    // Warp-level reduction. Every thread executes the shuffle; the fold is skipped when the
    // partner thread does not exist (last, partially filled warp of the block), so nothing
    // read from a missing lane can reach the warp's first lane.
    for (var offset = 16; offset > 0; offset >>= 1)
    {
        var other = DeviceFunction.ShuffleDown(accumulator, offset);
        if (tid + offset < bdm)
        {
            accumulator = op(accumulator, other);
        }
    }

    // One slot per warp of the block.
    var shared = __shared__.Array<T>(32);
    if (tid % WarpSize == 0)
    {
        shared[tid / WarpSize] = accumulator;
    }

    DeviceFunction.SyncThreads();

    if (tid == 0)
    {
        // Only slots below 'warps' were written; everything else is uninitialised shared
        // memory and must not be read.
        var warps = (bdm + WarpSize - 1) / WarpSize;

        var p0 = shared[0];
        var p1 = warps > 1 ? shared[1] : default(T);
        var p2 = warps > 2 ? shared[2] : default(T);
        var p3 = warps > 3 ? shared[3] : default(T);
        var p4 = warps > 4 ? shared[4] : default(T);
        var p5 = warps > 5 ? shared[5] : default(T);
        var p6 = warps > 6 ? shared[6] : default(T);
        var p7 = warps > 7 ? shared[7] : default(T);

        var a = op(op(p0, p1), op(p2, p3));
        var b = op(op(p4, p5), op(p6, p7));
        var total = op(a, b);

        // Blocks with more than 8 warps: fold the remaining partials in index order.
        for (var w = 8; w < warps; ++w)
        {
            total = op(total, shared[w]);
        }

        result[bid] = total;
    }
}