/// <summary>
/// Entry point: issues a host-to-device copy, a launch of "increment_kernel" from
/// asyncAPI.ptx, and a device-to-host copy through the asynchronous API, brackets
/// them with two events, prints the elapsed time between the events and reports
/// whether <c>CorrectOutput</c> accepts the array that was copied back.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create a new instance of CUDA class, select 1st device.
    CUDA cuda = new CUDA(0, true);

    // Prepare parameters.
    const int threadsPerBlock = 512;
    int n = 16 * 1024 * 1024;
    uint nbytes = (uint)(n * sizeof(int));
    int value = 26;

    // Allocate host memory.
    int[] a = new int[n];

    // Allocate device memory and fill every byte of it with 0xff.
    CUdeviceptr d_a = cuda.Allocate<int>(a);
    CUDADriver.cuMemsetD8(d_a, 0xff, nbytes);

    // Load the module and look up the kernel.
    cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx"));
    CUfunction func = cuda.GetModuleFunction("increment_kernel");

    // Set kernel launch configuration.
    cuda.SetFunctionBlockShape(func, threadsPerBlock, 1, 1);

    // Create cuda event handles.
    CUevent start = cuda.CreateEvent();
    CUevent stop = cuda.CreateEvent();

    // Asynchronously issue work to the GPU (all to stream 0).
    CUstream stream = new CUstream();
    cuda.RecordEvent(start);
    cuda.CopyHostToDeviceAsync<int>(d_a, a, stream);

    // Set parameters for the kernel function.
    // NOTE(review): the device pointer is passed as a 32-bit value while the next
    // parameter is placed at offset IntPtr.Size - confirm this matches the kernel's
    // parameter layout on 64-bit builds.
    cuda.SetParameter(func, 0, (uint)d_a.Pointer);
    cuda.SetParameter(func, IntPtr.Size, (uint)value);
    cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4));

    // Launch the kernel; n is an exact multiple of threadsPerBlock.
    cuda.LaunchAsync(func, n / threadsPerBlock, 1, stream);

    // Queue the copy back and mark the end of the timed region.
    cuda.CopyDeviceToHostAsync<int>(d_a, a, stream);
    cuda.RecordEvent(stop);

    // Everything above was issued through the asynchronous API, so it may still be
    // in flight. Block until the stop event has completed; only then are the elapsed
    // time and the contents of the host array meaningful. 0 is CUDA_SUCCESS.
    bool finished = (int)CUDADriver.cuEventSynchronize(stop) == 0;

    bool passed = false;
    if (finished)
    {
        // Print the GPU time.
        Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop));

        // Check the output for correctness.
        passed = CorrectOutput(a, value);
    }

    if (passed)
        Console.WriteLine("Test PASSED");
    else
        Console.WriteLine("Test FAILED");

    // Release resources.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(stop);
    cuda.Free(d_a);
}
/// <summary>
/// Entry point: issues a host-to-device copy, a launch of "increment_kernel" from
/// asyncAPI.ptx, and a device-to-host copy through the asynchronous API, brackets
/// them with two events, prints the elapsed time between the events and reports
/// whether <c>CorrectOutput</c> accepts the array that was copied back.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create a new instance of CUDA class, select 1st device.
    CUDA cuda = new CUDA(0, true);

    // Prepare parameters.
    const int threadsPerBlock = 512;
    int n = 16 * 1024 * 1024;
    uint nbytes = (uint)(n * sizeof(int));
    int value = 26;

    // Allocate host memory.
    int[] a = new int[n];

    // Allocate device memory and fill every byte of it with 0xff.
    CUdeviceptr d_a = cuda.Allocate<int>(a);
    CUDADriver.cuMemsetD8(d_a, 0xff, nbytes);

    // Load the module and look up the kernel.
    cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx"));
    CUfunction func = cuda.GetModuleFunction("increment_kernel");

    // Set kernel launch configuration.
    cuda.SetFunctionBlockShape(func, threadsPerBlock, 1, 1);

    // Create cuda event handles.
    CUevent start = cuda.CreateEvent();
    CUevent stop = cuda.CreateEvent();

    // Asynchronously issue work to the GPU (all to stream 0).
    CUstream stream = new CUstream();
    cuda.RecordEvent(start);
    cuda.CopyHostToDeviceAsync<int>(d_a, a, stream);

    // Set parameters for the kernel function.
    // NOTE(review): the device pointer is passed as a 32-bit value while the next
    // parameter is placed at offset IntPtr.Size - confirm this matches the kernel's
    // parameter layout on 64-bit builds.
    cuda.SetParameter(func, 0, (uint)d_a.Pointer);
    cuda.SetParameter(func, IntPtr.Size, (uint)value);
    cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4));

    // Launch the kernel; n is an exact multiple of threadsPerBlock.
    cuda.LaunchAsync(func, n / threadsPerBlock, 1, stream);

    // Queue the copy back and mark the end of the timed region.
    cuda.CopyDeviceToHostAsync<int>(d_a, a, stream);
    cuda.RecordEvent(stop);

    // Everything above was issued through the asynchronous API, so it may still be
    // in flight. Block until the stop event has completed; only then are the elapsed
    // time and the contents of the host array meaningful. 0 is CUDA_SUCCESS.
    bool finished = (int)CUDADriver.cuEventSynchronize(stop) == 0;

    bool passed = false;
    if (finished)
    {
        // Print the GPU time.
        Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop));

        // Check the output for correctness.
        passed = CorrectOutput(a, value);
    }

    if (passed)
    {
        Console.WriteLine("Test PASSED");
    }
    else
    {
        Console.WriteLine("Test FAILED");
    }

    // Release resources.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(stop);
    cuda.Free(d_a);
}