示例#1
0
        private float[,] KernelMul(float[,] a, float[,] b)
        {
            const int defaultBlockSize = 16;
            var blockDim = new dim3(Math.Min(b.Width(), defaultBlockSize), Math.Min(a.Height(), defaultBlockSize));
            var gridDim = new dim3((int)Math.Ceiling(1.0 * b.Width() / blockDim.X), (int)Math.Ceiling(1.0 * a.Height() / blockDim.Y));

            var ptx = LoadPtxFromResources();
            // mess with driver caching JIT results
            // since we need to verify the entire compilation log
            ptx = ptx.Replace("exit;", "exit; // " + Guid.NewGuid());

            using (var jitted = ptx.JitKernel(blockDim))
            {
                // todo. that's an untidy way to invoke a kernel
                // since it doesn't dispose of invocation parameters
                // neither it is shielded against partial initialization issues

                var c = new float[a.Height(), b.Width()];
                var c_result = jitted.Invoke(gridDim, blockDim,
                    a.Width().In(), a.Height().In(), a.In(),
                    b.Width().In(), b.Height().In(), b.In(),
                    c.Width().In(), c.Height().In(), c.Out());
                return (float[,])c_result;
            }
        }
示例#2
0
        public KernelResult Launch(dim3 gridDim, dim3 blockDim)
        {
            CudaDriver.Ensure();
            Args.AssertNone(p => p.IsDisposed);
            _hasCompletedExecution.AssertFalse();

            var offsets = Args.Scanbe(0, (offset, arg, _) => offset + arg.SizeInArgList);
            Args.Zip(offsets, (arg, offset) => arg.PassInto(this, offset));

            try
            {
                nvcuda.cuFuncSetBlockShape(Function, blockDim);
                nvcuda.cuFuncSetSharedSize(Function, (uint)Function.SharedSizeBytes);
                nvcuda.cuFuncSetCacheConfig(Function, CUfunc_cache.PreferNone);
                nvcuda.cuParamSetSize(Function, (uint)Args.Select(p => p.SizeInArgList).Sum());

                TraceBeforeLaunch(gridDim, blockDim);
                var wall_time = CudaProfiler.Benchmark(() => nvcuda.cuLaunchGrid(Function, gridDim));
                Log.WriteLine("Function execution succeeded in {0} ({1} = 0.5 {2}s).", wall_time, Syms.Epsilon, Syms.Mu);
                return new KernelResult(this, wall_time);
            }
            finally 
            {
                _hasCompletedExecution = true;
            }
        }
示例#3
0
        //[/cuRANDComputeValue]

        //[cuRANDPiEstimator]
        public double RunEstimation(int numSims, int threadBlockSize)
        {
            // Aim to launch around ten or more times as many blocks as there
            // are multiprocessors on the target device.
            const int blocksPerSm = 10;
            var numSMs = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;

            // Determine how to divide the work between cores
            var block = new dim3(threadBlockSize);
            var grid = new dim3((numSims + threadBlockSize - 1) / threadBlockSize);
            while (grid.x > 2 * blocksPerSm * numSims) grid.x >>= 1;

            var n = 2 * numSims;
            using (var dPoints = GPUWorker.Malloc<double>(n))
            using (var dResults = GPUWorker.Malloc<double>(grid.x))
            {
                // Generate random points in unit square
                var curand = new CURAND(GPUWorker, CURANDInterop.curandRngType.CURAND_RNG_QUASI_SOBOL64);
                curand.SetQuasiRandomGeneratorDimensions(2);
                curand.SetGeneratorOrdering(CURANDInterop.curandOrdering.CURAND_ORDERING_QUASI_DEFAULT);
                curand.GenerateUniformDouble(dPoints.Ptr, new IntPtr(n));

                var lp = new LaunchParam(grid, block, block.x * sizeof(uint));
                GPULaunch(ComputeValue, lp, dResults.Ptr, dPoints.Ptr, numSims);
                var value = dResults.Gather().Sum();
                return (value/numSims)*4.0;
            }
        }
示例#4
0
        internal GPGPUProperties(bool simulate = false, bool useAdvanced = true)
        {
            IsSimulated = simulate;
            Message = string.Empty;
            UseAdvanced = useAdvanced;
            MultiProcessorCount = 0;
            HighPerformanceDriver = false;
            SupportsDoublePrecision = true;
            AsynchEngineCount = 1;
            if (simulate)
            {
                Capability = new Version(0, 0);
                Name = "Simulator";
                DeviceId = 0;
                ulong freeMem = Int32.MaxValue;
                try
                {
                    PerformanceCounter pc = new PerformanceCounter("Memory", "Available Bytes");
                    freeMem = Convert.ToUInt64(pc.NextValue());
                }
                catch (Exception ex)
                {
                    Debug.WriteLine(ex.Message);
#if DEBUG
                    throw;
#endif 
                }
                TotalMemory = freeMem;
                MaxGridSize = new dim3(65536, 65536);
                MaxThreadsSize = new dim3(1024, 1024);
                MaxThreadsPerBlock = 1024;
            }
        }
示例#5
0
 public void Mult(int wA, int wB, int hC, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C)
 {
     var block = new dim3(BlockSize, BlockSize);
     var grid = new dim3(wB/block.x, hC/block.y);
     var lp = new LaunchParam(grid, block);
     GPULaunch(Kernel, lp, wA, wB, A, B, C);
 }
示例#6
0
        public static JittedKernel JitKernel(this String ptx, dim3 reqntid, HardwareIsa target)
        {
            ptx.AssertNotNull();
            CudaDriver.Ensure();

            var tuning = new JitTuning { Reqntid = reqntid };
            return ptx.JitKernel(tuning, target);
        }
示例#7
0
        public static KernelResult Run(this JittedFunction function, dim3 gridDim, dim3 blockDim, params KernelArgument[] args)
        {
            function.AssertNotNull();
            args = args ?? Seq.Empty<KernelArgument>().ToArray();
            CudaDriver.Ensure();

            return function.Run(gridDim, blockDim, (IEnumerable<KernelArgument>)args);
        }
示例#8
0
文件: Tuning.cs 项目: xeno-by/libptx
 public Tuning()
 {
     Maxnreg = 0;
     Maxntid = new dim3(0, 0, 0);
     Reqntid = new dim3(0, 0, 0);
     Minnctapersm = 0;
     Maxnctapersm = 0;
 }
示例#9
0
        public static Object Invoke(this JittedKernel kernel, dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args)
        {
            kernel.AssertNotNull();
            args = args ?? Seq.Empty<KernelArgument>().ToArray();
            CudaDriver.Ensure();

            return kernel.Function.Invoke(gridDim, blockDim, args);
        }
示例#10
0
        public static KernelResult Run(this CUfunction function, dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args)
        {
            function.IsNotNull.AssertTrue();
            args = args ?? Seq.Empty<KernelArgument>().ToArray();
            CudaDriver.Ensure();

            return new JittedFunction(function).Run(gridDim, blockDim, args);
        }
示例#11
0
        public static KernelResult Run(this JittedKernel kernel, dim3 gridDim, dim3 blockDim, params KernelArgument[] args)
        {
            kernel.AssertNotNull();
            args = args ?? Seq.Empty<KernelArgument>().ToArray();
            CudaDriver.Ensure();

            return kernel.Function.Run(gridDim, blockDim, args);
        }
示例#12
0
        public static KernelResult Run(this JittedFunction function, dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args)
        {
            function.AssertNotNull();
            args = args ?? Seq.Empty<KernelArgument>().ToArray();
            CudaDriver.Ensure();

            var invocation = new KernelInvocation(function, args);
            return invocation.Launch(gridDim, blockDim);
        }
示例#13
0
 public KernelThreadException(IKernel kernel, dim3? gridDim, int3? blockIdx, dim3? blockDim, int3? threadIdx, String workerThread, Exception innerException)
     : base(null, innerException)
 {
     Kernel = kernel.AssertNotNull();
     GridDim = gridDim;
     BlockIdx = blockIdx;
     BlockDim = blockDim;
     ThreadIdx = threadIdx;
     WorkerThread = workerThread.AssertNotNull();
 }
        public static void TestSimpleMultiply()
        {
            for (var iter = 1; iter <= 3; ++iter)
            {
                Console.WriteLine("====> Test SimpleMultiply with Alea GPU C# AOT instance usage (#.{0}) <====", iter);

                var timer = Stopwatch.StartNew();
                var worker = Util.Worker;
                Console.WriteLine("GPU: {0}", worker.Device.Name);
                timer.Stop();
                Console.WriteLine("Step 1) Runtime setup                   {0} ms", timer.Elapsed.TotalMilliseconds);

                timer.Restart();
                using (var module = new InstanceUsageAOT(GPUModuleTarget.Worker(worker)))
                {
                    module.GPUForceLoad();
                    timer.Stop();
                    Console.WriteLine("Step 2+3) Compile and Load module       {0} ms", timer.Elapsed.TotalMilliseconds);

                    const int factor = 8;
                    var a = Util.RandomMatrix(100 * factor, 200 * factor);
                    var b = Util.RandomMatrix(200 * factor, 300 * factor);
                    var aRows = 100 * factor;
                    var bCols = 300 * factor;
                    var aCols_bRows = 200 * factor;
                    var gridDim = new dim3(Util.Divup(bCols, TileSize), Util.Divup(aRows, TileSize));
                    var blockDim = new dim3(TileSize, TileSize);
                    var lp = new LaunchParam(gridDim, blockDim);

                    using (var devA = worker.Malloc(a))
                    using (var devB = worker.Malloc(b))
                    using (var devC = worker.Malloc<float>(aRows * bCols))
                    {
                        timer.Restart();
                        module.GPULaunch(module.SimpleMultiplyKernel, lp, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows);
                        worker.Synchronize();
                        timer.Stop();
                        Console.WriteLine("Kernel launch first time                {0} ms", timer.Elapsed.TotalMilliseconds);

                        const int repetitions = 50;
                        timer.Restart();
                        for (var i = 0; i < repetitions; ++i)
                        {
                            module.GPULaunch(module.SimpleMultiplyKernel, lp, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows);
                        }
                        worker.Synchronize();
                        timer.Stop();
                        Console.WriteLine("Kernel launch average time              {0} ms", (timer.Elapsed.TotalMilliseconds / (float)repetitions));

                        var c = devC.Gather();
                        Util.VerifyResult(a, b, c, aRows, bCols, aCols_bRows);
                    }
                }
            }
        }
示例#15
0
        internal GridCaps(CudaDevice device)
        {
            Device = device;

            MaxThreadsInBlock = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxThreadsPerBlock, device);

            var maxBlockX = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxBlockDimX, device);
            var maxBlockY = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxBlockDimY, device);
            var maxBlockZ = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxBlockDimZ, device);
            MaxBlockDim = new dim3(maxBlockX, maxBlockY, maxBlockZ);

            var maxGridX = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxGridDimX, device);
            var maxGridY = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxGridDimY, device);
            var maxGridZ = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxGridDimZ, device);
            MaxGridDim = new dim3(maxGridX, maxGridY, maxGridZ);
        }
示例#16
0
        private void TraceBeforeLaunch(dim3 gridDim, dim3 blockDim)
        {
            Log.EnsureBlankLine();
            Log.WriteLine("Launching function {0}...", Function);
            Log.WriteLine("Grid is configured as {2}: blockdim is {0}, griddim is {1}",
                blockDim.ToString().Slice(4),
                gridDim.ToString().Slice(4),
                new dim3(blockDim.X * gridDim.X, blockDim.Y * gridDim.Y, blockDim.Z * gridDim.Z).ToString().Slice(4));

            var log = new List<Object>();
            var offset = 0;
            var sizeInVRAM = 0;
            foreach (var args in Args)
            {
                var value = args.Get("_value");
                log.Add(Tuple.Create("+" + offset.ToString("0000"), args.Direction, value, args.SizeInVRAM, args.SizeInArgList));
                offset += args.SizeInArgList;
                sizeInVRAM += args.SizeInVRAM;
            }
            log.Add(Tuple.Create(offset.ToString("0000"), "", "", sizeInVRAM, offset));

            var formatted = log.Select(t => t.TupleItems().Select(v => v.ToString()).ToArray()).ToArray();
            Func<int, String, int, String> pad = (i, text, max) => i < 2 ? text.PadRight(max) : i == 2 ? text.PadRight((int)(max + 1)) : text;
            var padded = formatted.Select(entry => entry.Select((part, i) => pad(i, part, formatted.Max(entry1 => entry1[i].Length)))).ToArray();

            var maxLength = 0;
            padded.ForEach((entry, i) =>
            {
                if (i == log.Count() - 1 && maxLength != 0)
                {
                    Log.WriteLine("    " + "*".Repeat(maxLength - 4));
                }

                var line = String.Format("    {0} {1} {2} ({3} bytes in VRAM)", entry.SkipLast(1).ToArray());
                Log.WriteLine(line);
                maxLength = Math.Max(line.Length, maxLength);
            });
        }
示例#17
0
        void advectParticles(uint vbo, CudaPitchedDeviceVariable<cData> v, int dx, int dy, float dt, SizeT tPitch)
        {
            dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1);

            dim3 tids = new dim3(TIDSX, TIDSY, 1);

            cuda_vbo_resource.MapAllResources();
            CUdeviceptr p = cuda_vbo_resource[0].GetMappedPointer();
            advectParticles_k.GridDimensions = grid;
            advectParticles_k.BlockDimensions = tids;
            advectParticles_k.Run(p, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch);
            cuda_vbo_resource.UnmapAllResources();
        }
示例#18
0
        void advectVelocity(CudaPitchedDeviceVariable<cData> v, CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int pdx, int dy, float dt, SizeT tPitch)
        {
            dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1);

            dim3 tids = new dim3(TIDSX, TIDSY, 1);

            updateTexture(v, DIM * float2.SizeOf, DIM, tPitch);

            advectVelocity_k.GridDimensions = grid;
            advectVelocity_k.BlockDimensions = tids;
            advectVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, dt, TILEY / TIDSY);
        }
示例#19
0
        public void Run(dim3 gridDim, dim3 blockDim, Object kernel_instance)
        {
            // todo. due to a bug in Libcuda/nvcuda we cannot specify compilation Target
            // when we use default target (TargetFromContext) everything works fine tho
            // so let's stick to this solution for now
            (Cfg.Target == CudaVersions.HardwareIsa).AssertTrue();
            using (var jitted_ptx = Ptx.JitKernel(blockDim))
//            using (var jitted_ptx = Ptx.JitKernel(blockDim, Cfg.Target))
            {
                using (var kernel_args = _memcpyHostToDevice(kernel_instance))
                {
                    var kernel_result = jitted_ptx.Run(gridDim, blockDim, kernel_args);
                    _memcpyDeviceToHost(kernel_result, kernel_instance);
                }
            }
        }
示例#20
0
        /// <summary>
        ///     Вызов и исполнение одной элементарной функции по имени функции
        /// </summary>
        /// <param name="function"></param>
        public static void Execute(IEnumerable <string> functions, int input, int output)
        {
            Debug.Assert(input != 0 && output != 0);

            CudafyModule km = CudafyTranslator.Cudafy();

            GPGPU gpu = CudafyHost.GetDevice();

            gpu.LoadModule(km);

            int[,] devA = gpu.Allocate(_a);
            int[,] devB = gpu.Allocate(_b);
            int[] devC = gpu.Allocate(_c);
            int[] devD = gpu.Allocate(_d);
            int[] devE = gpu.Allocate(E);

            if ((input & (int)Register.A) != 0)
            {
                gpu.CopyToDevice(_a, devA);
            }
            if ((input & (int)Register.B) != 0)
            {
                gpu.CopyToDevice(_b, devB);
            }
            if ((input & (int)Register.C) != 0)
            {
                gpu.CopyToDevice(_c, devC);
            }
            if ((input & (int)Register.D) != 0)
            {
                gpu.CopyToDevice(_d, devD);
            }
            if ((input & (int)Register.E) != 0)
            {
                gpu.CopyToDevice(E, devE);
            }

            int rows    = _a.GetLength(0);
            int columns = _a.GetLength(1);

            dim3 gridSize  = Math.Min(15, (int)Math.Pow(rows * columns, 0.33333333333));
            dim3 blockSize = Math.Min(15, (int)Math.Pow(rows * columns, 0.33333333333));

            foreach (string function in functions)
            {
                gpu.Launch(gridSize, blockSize, function, devA, devB, devC, devD, devE);
            }

            if ((output & (int)Register.A) != 0)
            {
                gpu.CopyFromDevice(devA, _a);
            }
            if ((output & (int)Register.B) != 0)
            {
                gpu.CopyFromDevice(devB, _b);
            }
            if ((output & (int)Register.C) != 0)
            {
                gpu.CopyFromDevice(devC, _c);
            }
            if ((output & (int)Register.D) != 0)
            {
                gpu.CopyFromDevice(devD, _d);
            }
            if ((output & (int)Register.E) != 0)
            {
                gpu.CopyFromDevice(devE, E);
            }

            // free the memory allocated on the GPU
            gpu.FreeAll();
        }
示例#21
0
 public virtual void SetupExecution(dim3 blockDimensions, dim3 gridDimensions)
 {
     m_kernel.BlockDimensions = blockDimensions;
     m_kernel.GridDimensions  = gridDimensions;
 }
示例#22
0
        private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
        {
            int count = width * height * depth;
            int widthD = width - 1;
            int heightD = height - 1;
            int depthD = depth - 1;
            int countDecremented = widthD * heightD * depthD;

            dim3 blockDimensions = new dim3(8, 8, 8);
            dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
            dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

            CUDANoiseCube noiseCube = new CUDANoiseCube();

            CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
            CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

            CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

            kernelPositionWeight.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);

            kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

            kernelNormalAmbient.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);

            kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

            int nearestW = NearestPowerOfTwo(widthD);
            int nearestH = NearestPowerOfTwo(heightD);
            int nearestD = NearestPowerOfTwo(depthD);
            int nearestCount = nearestW * nearestH * nearestD;

            CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
            trisCountDevice.Memset(0);
            CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

            kernelMarchingCubesCases.BlockDimensions = blockDimensions;
            typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);

            kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

            CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

            int lastTrisCount = 0;
            trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

            int lastPrefixSum = 0;
            prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

            int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

            if (totalVerticesCount > 0)
            {
                if (container.Geometry != null)
                    container.Geometry.Dispose();

                container.VertexCount = totalVerticesCount;

                container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
                {
                    BindFlags = BindFlags.VertexBuffer,
                    CpuAccessFlags = CpuAccessFlags.None,
                    OptionFlags = ResourceOptionFlags.None,
                    SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
                    Usage = ResourceUsage.Default
                });

                CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);
                
                kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
                typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

                directResource.Map();
                kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
                directResource.UnMap();

                directResource.Dispose();
            }
            else
            {
                container.VertexCount = 0;

                if (container.Geometry != null)
                    container.Geometry.Dispose();
            }

            noiseCube.Dispose();
            prefixSumsDev.Dispose();
            trisCountDevice.Dispose();
            offsetsDev.Dispose();
            noiseArray.Dispose();
            noiseTexture.Dispose();
            voxelsDev.Dispose();
        }
示例#23
0
        static void Main(string[] args)
        {
            const int nx = 2048;
            const int ny = 2048;

            // shifts applied to x and y data
            const int x_shift = 5;
            const int y_shift = 7;

            ShrQATest.shrQAStart(args);

            if ((nx%TILE_DIM != 0)  || (ny%TILE_DIM != 0))
            {
                Console.Write("nx and ny must be multiples of TILE_DIM\n");
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED);
            }

            // execution configuration parameters
            dim3 grid = new dim3(nx/TILE_DIM, ny/TILE_DIM, 1);
            dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1);

            // This will pick the best possible CUDA capable device
            int devID = findCudaDevice(args);

            //Load Kernel image from resources
            string resName;
            if (IntPtr.Size == 8)
                resName = "simplePitchLinearTexture_x64.ptx";
            else
                resName = "simplePitchLinearTexture.ptx";

            string resNamespace = "simplePitchLinearTexture";
            string resource = resNamespace + "." + resName;
            Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
            if (stream == null) throw new ArgumentException("Kernel not found in resources.");
            byte[] kernels = new byte[stream.Length];

            int bytesToRead = (int)stream.Length;
            while (bytesToRead > 0)
            {
                bytesToRead -= stream.Read(kernels, (int)stream.Position, bytesToRead);
            }

            CudaKernel PLKernel = ctx.LoadKernelPTX(kernels, "shiftPitchLinear");
            CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray");

            CudaStopWatch stopwatch = new CudaStopWatch();

            // ----------------------------------
            // Host allocation and initialization
            // ----------------------------------

            float[] h_idata = new float[nx * ny];
            float[] h_odata = new float[nx * ny];
            float[] gold = new float[nx * ny];

            for (int i = 0; i < nx * ny; ++i) h_idata[i] = (float)i;

            // ------------------------
            // Device memory allocation
            // ------------------------

            // Pitch linear input data
            CudaPitchedDeviceVariable<float> d_idataPL = new CudaPitchedDeviceVariable<float>(nx, ny);

            // Array input data
            CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One);

            // Pitch linear output data
            CudaPitchedDeviceVariable<float> d_odata = new CudaPitchedDeviceVariable<float>(nx, ny);

            // ------------------------
            // copy host data to device
            // ------------------------

            // Pitch linear
            d_idataPL.CopyToDevice(h_idata);

            // Array
            d_idataArray.CopyFromHostToThis<float>(h_idata);

            // ----------------------
            // Bind texture to memory
            // ----------------------

            // Pitch linear
            CudaTextureLinearPitched2D<float> texRefPL = new CudaTextureLinearPitched2D<float>(PLKernel, "texRefPL", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL);
            CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray);

            // ---------------------
            // reference calculation
            // ---------------------

            for (int j = 0; j < ny; j++)
            {
                int jshift = (j + y_shift) % ny;
                for (int i = 0; i < nx; i++)
                {
                    int ishift = (i + x_shift) % nx;
                    gold[j * nx + i] = h_idata[jshift * nx + ishift];
                }
            }

            // ----------------
            // shiftPitchLinear
            // ----------------

            ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
            PLKernel.BlockDimensions = threads;
            PLKernel.GridDimensions = grid;
            stopwatch.Start();
            for (int i=0; i < NUM_REPS; i++)
            {
                PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch/sizeof(float)), nx, ny, x_shift, y_shift);
            }
            stopwatch.Stop();
            stopwatch.StopEvent.Synchronize();
            float timePL = stopwatch.GetElapsedTime();

            // check results
            d_odata.CopyToHost(h_odata);

            bool res = cutComparef(gold, h_odata);

            bool success = true;
            if (res == false) {
                Console.Write("*** shiftPitchLinear failed ***\n");
                success = false;
            }

            // ----------
            // shiftArray
            // ----------

            ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes);
            ArrayKernel.BlockDimensions = threads;
            ArrayKernel.GridDimensions = grid;
            stopwatch.Start();
            for (int i=0; i < NUM_REPS; i++) {
                ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch/sizeof(float)), nx, ny, x_shift, y_shift);

            }

            stopwatch.Stop();
            stopwatch.StopEvent.Synchronize();
            float timeArray = stopwatch.GetElapsedTime();

            // check results
            d_odata.CopyToHost(h_odata);

            res = cutComparef(gold, h_odata);

            if (res == false) {
                Console.Write("*** shiftArray failed ***\n");
                success = false;
            }

            float bandwidthPL = 2.0f*1000.0f*nx*ny*sizeof(float)/(1e+9f)/(timePL/NUM_REPS);
            float bandwidthArray = 2.0f*1000.0f*nx*ny*sizeof(float)/(1e+9f)/(timeArray/NUM_REPS);
            Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n",
                bandwidthPL, bandwidthArray);

            float fetchRatePL = nx*ny/1e+6f/(timePL/(1000.0f*NUM_REPS));
            float fetchRateArray = nx*ny/1e+6f/(timeArray/(1000.0f*NUM_REPS));
            Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n",
                fetchRatePL, fetchRateArray);

            // cleanup
            texRefPL.Dispose();
            texRefArray.Dispose();
            d_idataPL.Dispose();
            d_idataArray.Dispose();
            d_odata.Dispose();
            stopwatch.Dispose();
            ctx.Dispose();

            ShrQATest.shrQAFinishExit(args, (success == true) ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
        }
示例#24
0
        void addForces(CudaPitchedDeviceVariable<float2> v, int dx, int dy, int spx, int spy, float fx, float fy, int r, SizeT tPitch)
        {
            dim3 tids = new dim3((uint)(2 * r + 1), (uint)(2 * r + 1), 1);

            addForces_k.GridDimensions = new dim3(1);
            addForces_k.BlockDimensions = tids;
            addForces_k.Run(v.DevicePointer, dx, dy, spx, spy, fx, fy, r, tPitch);
        }
示例#25
0
        private void generateKernels(string forwardName, string backName, string clrName, string activeName, dim3 kernelSize)
        {
            forward = ctx.LoadKernel("kernel.ptx", forwardName);
            forward.GridDimensions  = new dim3(size.x, size.y, size.z);
            forward.BlockDimensions = kernelSize;

            back = ctx.LoadKernel("kernel.ptx", backName);
            back.GridDimensions  = new dim3(size.x, size.y, size.z);
            back.BlockDimensions = kernelSize;

            clear = ctx.LoadKernel("kernel.ptx", activeName);
            clear.GridDimensions = new dim3(size.x, size.y, size.z);

            activate = ctx.LoadKernel("kernel.ptx", activeName);
            activate.GridDimensions = new dim3(size.x, size.y, size.z);
        }
示例#26
0
文件: IGrid.cs 项目: xeno-by/conflux
 public Grid(dim3 gridDim, dim3 blockDim)
 {
     GridDim = gridDim;
     BlockDim = blockDim;
 }
示例#27
0
        public float3[] GetPointsGPU(int NumPoints)
        {
            int BlockSize = 512;

            if (NumPoints % BlockSize != 0)
            {
                throw new Exception("NumPoints must be divisible by " + BlockSize.ToString());
            }
            int[] TriangleCounts = new int[GridCount + 1];
            var   Maxima         = new float3[GridCount];
            var   Minima         = new float3[GridCount];

            TriangleCounts[0] = 0;
            for (int i = 0; i < GridCount; i++)
            {
                int LocalCount = TriangleCounts[i] + (int)Domains[i].TriangleCount;
                if (Domains[i].TriangleCount > BlockSize)
                {
                    throw new Exception("STL File must have no more than " + BlockSize.ToString() + " Triangles");
                }
                TriangleCounts[i + 1] = LocalCount;
                Minima[i]             = STLReader.ToFloat3(Domains[i].Extrema.Min);
                Maxima[i]             = STLReader.ToFloat3(Domains[i].Extrema.Max);
            }
            var Triangles = new TriangleSTL[TriangleCounts[GridCount]];
            int id        = 0;

            for (int i = 0; i < GridCount; i++)
            {
                for (int j = 0; j < TriangleCounts[i]; j++)
                {
                    var LocalTri = Domains[i].Triangles[j];
                    Triangles[id] = new TriangleSTL(LocalTri);
                    id++;
                }
            }

            var ctx              = new CudaContext(1);
            var DeviceInfo       = ctx.GetDeviceInfo();
            var d_Triangles      = new CudaDeviceVariable <TriangleSTL>(Triangles.Length);
            var d_TriangleCounts = new CudaDeviceVariable <int>(GridCount);
            var d_Minima         = new CudaDeviceVariable <float3>(GridCount);
            var d_Maxima         = new CudaDeviceVariable <float3>(GridCount);
            var d_Points         = new CudaDeviceVariable <float3>(GridCount * NumPoints);
            var h_Points         = new float3[GridCount * NumPoints];
            var rng              = new Random(0); // use a sequence that is repeatable over and over again

            for (int i = 0; i < GridCount * NumPoints; i++)
            {
                h_Points[i].x = (float)rng.NextDouble();
                h_Points[i].y = (float)rng.NextDouble();
                h_Points[i].z = (float)rng.NextDouble();
            }
            int ctr = 0;

            for (int i = 0; i < GridCount; i++)
            {
                for (int j = 0; j < NumPoints; j++)
                {
                    h_Points[ctr].x = Minima[i].x + h_Points[ctr].x * (Maxima[i].x - Minima[i].x);
                    h_Points[ctr].y = Minima[i].y + h_Points[ctr].y * (Maxima[i].y - Minima[i].y);
                    h_Points[ctr].z = Minima[i].z + h_Points[ctr].z * (Maxima[i].z - Minima[i].z);
                    ctr++;
                }
            }
            d_Points         = h_Points;
            d_Triangles      = Triangles;
            d_TriangleCounts = TriangleCounts;
            d_Minima         = Minima;
            d_Maxima         = Maxima;
            // copy over to host
            // TODO generate grid on GPU instead of CPU


            var PointInPolygonKernel = ctx.LoadKernelPTX("PointInPolygon.ptx", "PointInPolygon");
            var BlockDim             = new dim3(BlockSize, 1, 1);
            var GridDim = new dim3(GridCount, 1, 1);

            PointInPolygonKernel.BlockDimensions = BlockDim;
            PointInPolygonKernel.GridDimensions  = GridDim;

            PointInPolygonKernel.Run(GridCount,
                                     NumPoints,
                                     d_TriangleCounts.DevicePointer,
                                     d_Triangles.DevicePointer,
                                     d_Maxima.DevicePointer,
                                     d_Minima.DevicePointer,
                                     d_Points.DevicePointer);
            h_Points = d_Points;

            return(h_Points); // TODO Fix this to remove bad points
        }
 private CudaKernel InitializeGridsAndThreads(string kernalName, Matrix a)
 {
     int maxThreads = Math.Min(this.maxThreadPerBlockDim, a.Row);
     dim3 threads = a.Column == 1 ? new dim3(maxThreads, 1) : new dim3(maxThreads, maxThreads);
     dim3 blocks = new dim3((a.Row + maxThreads - 1) / maxThreads, (a.Column + maxThreads - 1) / maxThreads);
     CudaKernel kernel = new CudaKernel(kernalName, this.cuModule, this.cudaContext)
     {
         GridDimensions = blocks,
         BlockDimensions = threads
     };
     return kernel;
 }
        /// <summary>
        /// Invokes the specified context.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="cudaContext">The cuda context.</param>
        /// <param name="kernelName">Name of the kernel.</param>
        /// <param name="grid">The grid.</param>
        /// <param name="block">The block.</param>
        /// <param name="smemSize">Size of the smem.</param>
        /// <param name="stream">The stream.</param>
        /// <param name="args">The arguments.</param>
        private void Invoke(TSCudaContext context, CudaContext cudaContext, string kernelName, dim3 grid, dim3 block, uint smemSize, CUstream stream, params object[] args)
        {
            var ptx    = GetPtx(context.Compiler);
            var kernel = context.KernelCache.Get(cudaContext, ptx, kernelName);

            kernel.GridDimensions      = grid;
            kernel.BlockDimensions     = block;
            kernel.DynamicSharedMemory = smemSize;
            kernel.RunAsync(stream, args);
        }
示例#30
0
        void diffuseProject(CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int dy, float dt, float visc, SizeT tPitch)
        {
            // Forward FFT
            planr2c.Exec(vx.DevicePointer);
            planr2c.Exec(vy.DevicePointer);

            dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1);

            dim3 tids = new dim3(TIDSX, TIDSY, 1);

            diffuseProject_k.GridDimensions = grid;
            diffuseProject_k.BlockDimensions = tids;
            diffuseProject_k.Run(vx.DevicePointer, vy.DevicePointer, dx, dy, dt, visc, TILEY / TIDSY);

            // Inverse FFT
            planc2r.Exec(vx.DevicePointer);
            planc2r.Exec(vy.DevicePointer);
        }
示例#31
0
 /// <summary>
 ///  NOTE: Compute Capability 3.5 and later only. Dynamic parallelism. Call from a single thread.
 ///  Not supported by emulator.
 /// </summary>
 /// <param name="gridSize">Size of grid.</param>
 /// <param name="blockSize">Size of block.</param>
 /// <param name="functionName">Name of function to launch.</param>
 /// <param name="args">Arguments.</param>
 public static int Launch(this GThread thread, dim3 gridSize, dim3 blockSize, string functionName, params object[] args)
 {
     ThrowNotSupported();
     return 0;
 }
示例#32
0
        void updateVelocity(CudaPitchedDeviceVariable<cData> v, CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int pdx, int dy, SizeT tPitch)
        {
            dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1);

            dim3 tids = new dim3(TIDSX, TIDSY, 1);

            updateVelocity_k.GridDimensions = grid;
            updateVelocity_k.BlockDimensions = tids;
            updateVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, TILEY / TIDSY, tPitch);
        }
示例#33
0
        public static void InvokeReduce(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, dim3 grid, dim3 block, uint smemSize, ApplySpecialization spec, params object[] args)
        {
            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            var kernel = context.KernelCache.Get(cudaContext, ptx, kernelName);

            kernel.GridDimensions      = grid;
            kernel.BlockDimensions     = block;
            kernel.DynamicSharedMemory = smemSize;

            kernel.Run(args);
        }
示例#34
0
 public static extern CUResult cuMemcpyDtoH_v2(ref dim3 dstHost, CUdeviceptr srcDevice, SizeT ByteCount);
示例#35
0
 public void SetDims(dim3 gridDim, dim3 blockDim)
 {
     GridDim = gridDim;
     BlockDim = blockDim;
 }
示例#36
0
        public static TestingStructure[] CorrectColour(System.Drawing.Color rgb, double X, double Y, double Z)
        {
            //rgb = System.Drawing.Color.FromArgb(69, 77, 217);
            //X = 0.0630982813175294;
            //Y = 0.616476271122916;
            //Z = 0.667048468232457;

            //cuda intializer
            CudafyModule km = CudafyModule.TryDeserialize();

            if (km == null || !km.TryVerifyChecksums())
            {
                // km = CudafyTranslator.Cudafy((typeof(ForeGroundStrucuture)), (typeof(BackGroundStrucuture)), typeof(Color));
                km = CudafyTranslator.Cudafy((typeof(ProfileStrucuture)), (typeof(ForeGroundStrucuture)), (typeof(BackGroundStrucuture)), (typeof(SampleStructure)), (typeof(TestingStructure)), typeof(snake));
                km.TrySerialize();
            }

            CudafyTranslator.GenerateDebug = true;
            // cuda or emulator
            GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId);

            //GPGPU gpu = CudafyHost.GetDevice(eGPUType.Emulator);
            Console.WriteLine("Running snake correction using {0}", gpu.GetDeviceProperties(false).Name);
            gpu.LoadModule(km);

            TestingStructure[] distance_CPU = new TestingStructure[1];

            // allocate memory on the GPU for the bitmap (same size as ptr)
            #region
            DataTable profile = new DataTable();
            try
            {
                // add the csv bin file
                using (GenericParserAdapter parser = new GenericParserAdapter(@"C:\lev\STColorCorrection\Data\PROFILE\p3700.csv"))
                {
                    System.Data.DataSet dsResult = parser.GetDataSet();
                    profile = dsResult.Tables[0];
                }
            }
            catch (Exception ex)
            { Console.WriteLine(ex); }
            #endregion

            // allocate temp memory, initialize it, copy to constant memory on the GPU
            // L 0-21 A 0-41 B 0-45

            ProfileStrucuture[, ,] profiles_CPU = new ProfileStrucuture[21, 41, 45];
            ForeGroundStrucuture[] foregorungRGB_CPU = new ForeGroundStrucuture[1];
            BackGroundStrucuture[] BackgroundXYZ_CPU = new BackGroundStrucuture[1];
            SampleStructure[,] samples_CPU = new SampleStructure[1, 6];

            //profile inicialization
            #region
            for (int indexL = 0; indexL < 21; indexL++)
            {
                for (int indexA = 0; indexA < 41; indexA++)
                {
                    for (int indexB = 0; indexB < 45; indexB++)
                    {
                        profiles_CPU[indexL, indexA, indexB].L = indexL;
                        profiles_CPU[indexL, indexA, indexB].A = indexA;
                        profiles_CPU[indexL, indexA, indexB].B = indexB;
                        //profiles_CPU[indexL, indexA, indexB].Given_R = 0;
                        //profiles_CPU[indexL, indexA, indexB].Given_G = 0;
                        //profiles_CPU[indexL, indexA, indexB].Given_B = 0;
                        profiles_CPU[indexL, indexA, indexB].ML       = 0;
                        profiles_CPU[indexL, indexA, indexB].MA       = 0;
                        profiles_CPU[indexL, indexA, indexB].MB       = 0;
                        profiles_CPU[indexL, indexA, indexB].MX       = 0;
                        profiles_CPU[indexL, indexA, indexB].MY       = 0;
                        profiles_CPU[indexL, indexA, indexB].MZ       = 0;
                        profiles_CPU[indexL, indexA, indexB].distance = -1.0;
                        profiles_CPU[indexL, indexA, indexB].weight   = -1.0;

                        profiles_CPU[indexL, indexA, indexB].isempty = TRUE;
                        profiles_CPU[indexL, indexA, indexB].isMoreAccurateThanOrigin = FALSE;
                    }
                }
            }



            int lvalue, avalue, bvalue;
            try
            {
                for (int i = 1; i < profile.Rows.Count; i++)
                {
                    lvalue = Convert.ToInt32(profile.Rows[i][0].ToString());
                    avalue = Convert.ToInt32(profile.Rows[i][1].ToString());
                    bvalue = Convert.ToInt32(profile.Rows[i][2].ToString());

                    lvalue = (int)(lvalue * 0.2);
                    avalue = (int)(avalue * 0.2) + 20;
                    bvalue = (int)(bvalue * 0.2) + 22;

                    profiles_CPU[lvalue, avalue, bvalue].L = lvalue;
                    profiles_CPU[lvalue, avalue, bvalue].A = avalue;
                    profiles_CPU[lvalue, avalue, bvalue].B = bvalue;

                    //profiles_CPU[lvalue, avalue, bvalue].Given_R = (double)Convert.ToByte(profile.Rows[i][9].ToString());
                    //profiles_CPU[lvalue, avalue, bvalue].Given_G = (double)Convert.ToByte(profile.Rows[i][10].ToString());
                    //profiles_CPU[lvalue, avalue, bvalue].Given_B = (double)Convert.ToByte(profile.Rows[i][11].ToString());

                    profiles_CPU[lvalue, avalue, bvalue].ML = (double)Convert.ToDouble(profile.Rows[i][3].ToString());
                    profiles_CPU[lvalue, avalue, bvalue].MA = (double)Convert.ToDouble(profile.Rows[i][4].ToString());
                    profiles_CPU[lvalue, avalue, bvalue].MB = (double)Convert.ToDouble(profile.Rows[i][5].ToString());

                    profiles_CPU[lvalue, avalue, bvalue].MX = (double)Convert.ToDouble(profile.Rows[i][6].ToString());
                    profiles_CPU[lvalue, avalue, bvalue].MY = (double)Convert.ToDouble(profile.Rows[i][7].ToString());
                    profiles_CPU[lvalue, avalue, bvalue].MZ = (double)Convert.ToDouble(profile.Rows[i][8].ToString());


                    profiles_CPU[lvalue, avalue, bvalue].isempty = FALSE;
                }
            }
            catch (Exception ex)
            { Console.WriteLine(ex); }
            #endregion

            //grab the colors
            ProfileStrucuture[, ,] profile_GPU = gpu.CopyToDevice(profiles_CPU);
            SampleStructure[,] samples_GPU     = gpu.CopyToDevice(samples_CPU);

            Point3D background = new Point3D(X, Y, Z);

            //foreground and background image inicialization
            #region
            try
            {
                for (int i = 0; i < 1; i++)
                {
                    foregorungRGB_CPU[i].R = rgb.R;
                    foregorungRGB_CPU[i].G = rgb.G;
                    foregorungRGB_CPU[i].B = rgb.B;

                    BackgroundXYZ_CPU[i].X = background.X;
                    BackgroundXYZ_CPU[i].Y = background.Y;
                    BackgroundXYZ_CPU[i].Z = background.Z;
                }
            }
            catch (Exception ex)
            { Console.WriteLine(ex); }
            #endregion

            //begin execution
            // capture the start time
            gpu.StartTimer();
            ForeGroundStrucuture[] foregorungRGB_GPU = gpu.CopyToDevice(foregorungRGB_CPU);
            BackGroundStrucuture[] BackgroundXYZ_GPU = gpu.CopyToDevice(BackgroundXYZ_CPU);

            //out put
            TestingStructure[] distance_GPU = gpu.Allocate(distance_CPU);

            // generate a bitmap from our sphere data
            //Image size: 1024 x 768

            //dim3 grids = new dim3(1024 / 16, 768 / 16);
            //dim3 threads = new dim3(16, 16);

            dim3 grids   = new dim3(1, 1);
            dim3 threads = new dim3(1, 1);

            //quick_correct
            //gpu.Launch(grids, threads, ((Action<GThread, ProfileStrucuture[, ,], ForeGroundStrucuture[], BackGroundStrucuture[], ProfileStrucuture[], SampleStructure[,]>)QuickCorr), profile_GPU, foregorungRGB_GPU, BackgroundXYZ_GPU, distance_GPU, samples_GPU);

            //quick correct - testing
            gpu.Launch(grids, threads, ((Action <GThread, ProfileStrucuture[, , ], ForeGroundStrucuture[], BackGroundStrucuture[], TestingStructure[], SampleStructure[, ]>)QuickCorr), profile_GPU, foregorungRGB_GPU, BackgroundXYZ_GPU, distance_GPU, samples_GPU);


            // copy our bitmap back from the GPU for display
            gpu.CopyFromDevice(distance_GPU, distance_CPU);


            // get stop time, and display the timing results
            double elapsedTime = gpu.StopTimer();
            distance_CPU[0].execution_time = elapsedTime;
            Console.WriteLine("Time to generate: {0} ms", elapsedTime);
            gpu.Free(foregorungRGB_GPU);
            gpu.Free(BackgroundXYZ_GPU);
            gpu.Free(distance_GPU);
            gpu.FreeAll();

            return(distance_CPU);
        }
示例#37
0
 public KernelResult Run(dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args)
 {
     var invocation = new KernelInvocation(this, args);
     return invocation.Launch(gridDim, blockDim);
 }