//[/GenericScanDownsweepKernel]

/// <summary>
/// Runs the three-stage GPU pipeline (upsweep, reduction of the per-range totals,
/// downsweep) over <paramref name="input"/> and returns the result gathered from
/// the device output buffer.
/// </summary>
/// <param name="input">
/// Host array to process. An empty array yields an empty result without any GPU work.
/// </param>
/// <param name="inclusive">
/// Forwarded to the downsweep kernel as 1 when <c>true</c> and 0 when <c>false</c>
/// (presumably selects an inclusive vs. exclusive scan; confirm against the kernel).
/// </param>
/// <returns>The array gathered from the device output buffer.</returns>
public T[] Apply(T[] input, bool inclusive)
{
    var n = input.Length;

    // An empty input has an empty result. Returning here avoids asking the device
    // for zero-sized allocations and launches, which the CUDA driver API rejects.
    if (n == 0)
    {
        return new T[0];
    }

    // Work is split into ranges based on the number of multiprocessors on the device.
    var numSm = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;
    var tup = Plan.BlockRanges(numSm, n);
    var ranges = tup.Item1;
    var numRanges = tup.Item2;

    // Upsweep and downsweep use one block per range; the reduction over the
    // range totals runs in a single block.
    var lpUpsweep = new LaunchParam(numRanges, Plan.NumThreads);
    var lpReduce = new LaunchParam(1, Plan.NumThreadsReduction);
    var lpDownsweep = new LaunchParam(numRanges, Plan.NumThreads);

    // The downsweep kernel takes the flag as an int rather than a bool.
    var _inclusive = inclusive ? 1 : 0;

    using (var dRanges = GPUWorker.Malloc(ranges))
    using (var dRangeTotals = GPUWorker.Malloc<T>(numRanges + 1))
    using (var dInput = GPUWorker.Malloc(input))
    // NOTE(review): the output buffer is created from a copy of input. If Downsweep
    // overwrites every element, an uninitialised allocation of n elements would
    // avoid this transfer; confirm before changing.
    using (var dOutput = GPUWorker.Malloc(input))
    {
        _reduceModule.Upsweep(lpUpsweep, dInput.Ptr, dRanges.Ptr, dRangeTotals.Ptr);
        GPULaunch(ScanReduce, lpReduce, numRanges, dRangeTotals.Ptr);
        GPULaunch(Downsweep, lpDownsweep, dInput.Ptr, dOutput.Ptr, dRangeTotals.Ptr, dRanges.Ptr, _inclusive);
        return dOutput.Gather();
    }
}