示例#1
0
        //[/GenericScanDownsweepKernel]

        /// <summary>
        /// Runs the three-stage GPU scan over <paramref name="input"/>: an upsweep launch that fills
        /// the per-range totals buffer, a single-block launch of <c>ScanReduce</c> over those totals,
        /// and a downsweep launch that writes the output buffer, which is then gathered to the host.
        /// </summary>
        /// <param name="input">Host array to scan. Its length determines the block ranges.</param>
        /// <param name="inclusive">
        /// Forwarded to the downsweep kernel as 1 when <c>true</c> and 0 when <c>false</c>;
        /// presumably selects an inclusive versus exclusive scan (confirm against the kernel).
        /// </param>
        /// <returns>
        /// A host array gathered from the device output buffer, or an empty array when
        /// <paramref name="input"/> is empty.
        /// </returns>
        public T[] Apply(T[] input, bool inclusive)
        {
            var n = input.Length;

            // A scan of zero elements is empty; return before planning ranges, allocating
            // device buffers or launching kernels with nothing to process.
            if (n == 0)
            {
                return new T[0];
            }

            var numSm     = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;
            var plan      = Plan.BlockRanges(numSm, n);
            var ranges    = plan.Item1;
            var numRanges = plan.Item2;

            // Upsweep and downsweep use one block per range; the totals reduction uses a single block.
            var lpUpsweep   = new LaunchParam(numRanges, Plan.NumThreads);
            var lpReduce    = new LaunchParam(1, Plan.NumThreadsReduction);
            var lpDownsweep = new LaunchParam(numRanges, Plan.NumThreads);

            // The kernel receives the flag as an int rather than a bool.
            var inclusiveFlag = inclusive ? 1 : 0;

            using (var dRanges = GPUWorker.Malloc(ranges))
            using (var dRangeTotals = GPUWorker.Malloc<T>(numRanges + 1))
            using (var dInput = GPUWorker.Malloc(input))
            // NOTE(review): the output buffer is created from `input` as well, so it starts out
            // holding the input values. If the downsweep kernel writes every element, an
            // uninitialized allocation of n elements may be enough - confirm before changing.
            using (var dOutput = GPUWorker.Malloc(input))
            {
                _reduceModule.Upsweep(lpUpsweep, dInput.Ptr, dRanges.Ptr, dRangeTotals.Ptr);
                GPULaunch(ScanReduce, lpReduce, numRanges, dRangeTotals.Ptr);
                GPULaunch(Downsweep, lpDownsweep, dInput.Ptr, dOutput.Ptr, dRangeTotals.Ptr, dRanges.Ptr, inclusiveFlag);

                // Gather runs inside the using scope, before the device buffers are disposed.
                return dOutput.Gather();
            }
        }