/// <summary>
/// Queries the driver for the time elapsed between two events.
/// The driver status code is stored in <c>LastError</c>.
/// </summary>
/// <param name="start">Event marking the beginning of the interval.</param>
/// <param name="end">Event marking the end of the interval.</param>
/// <returns>The value written by <c>cuEventElapsedTime</c> (0 if the driver wrote nothing).</returns>
public float ElapsedTime(CUevent start, CUevent end)
{
    float elapsedMs = 0f;

    this.LastError = CUDADriver.cuEventElapsedTime(ref elapsedMs, start, end);

    return elapsedMs;
}
/// <summary>
/// Records <paramref name="hEvent"/> on <paramref name="hStream"/> through the native entry point.
/// Every failure is surfaced as a <c>CudaException</c>.
/// </summary>
/// <param name="hEvent">Event to record.</param>
/// <param name="hStream">Stream the event is recorded on.</param>
public static void cuEventRecord(CUevent hEvent, CUstream hStream)
{
    Wrap(() =>
    {
        try
        {
            var status = nativeEventRecord(hEvent, hStream);
            if (status == CUresult.CUDA_SUCCESS)
            {
                return;
            }

            throw new CudaException(status);
        }
        catch (CudaException)
        {
            // Already in the expected shape: let it propagate untouched.
            throw;
        }
        catch (DllNotFoundException missingDriver)
        {
            throw new CudaException(CudaError.NoDriver, missingDriver);
        }
        catch (Exception unexpected)
        {
            throw new CudaException(CudaError.Unknown, unexpected);
        }
    });
}
/// <summary>
/// Creates an event with the given creation flags.
/// The driver status code is stored in <c>LastError</c>.
/// </summary>
/// <param name="flags">Event creation flags, passed to the driver as an unsigned integer.</param>
/// <returns>The event handle filled in by <c>cuEventCreate</c>.</returns>
public CUevent CreateEvent(CUEventFlags flags)
{
    CUevent handle = new CUevent();
    uint rawFlags = (uint)flags;

    this.LastError = CUDADriver.cuEventCreate(ref handle, rawFlags);

    return handle;
}
/// <summary>
/// Measures the time elapsed between two events through the native entry point.
/// Every failure is surfaced as a <c>CudaException</c>.
/// </summary>
/// <param name="hStart">Event marking the beginning of the interval.</param>
/// <param name="hEnd">Event marking the end of the interval.</param>
/// <returns>An <c>ElapsedTime</c> built from the millisecond value reported by the driver.</returns>
public static ElapsedTime cuEventElapsedTime(CUevent hStart, CUevent hEnd)
{
    return Wrap(() =>
    {
        try
        {
            float elapsedMs;
            var status = nativeEventElapsedTime(out elapsedMs, hStart, hEnd);
            if (status == CUresult.CUDA_SUCCESS)
            {
                // Note: TimeSpan cannot be used here because it does not work with
                // fractions of milliseconds.
                return new ElapsedTime(elapsedMs);
            }

            throw new CudaException(status);
        }
        catch (CudaException)
        {
            // Already in the expected shape: let it propagate untouched.
            throw;
        }
        catch (DllNotFoundException missingDriver)
        {
            throw new CudaException(CudaError.NoDriver, missingDriver);
        }
        catch (Exception unexpected)
        {
            throw new CudaException(CudaError.Unknown, unexpected);
        }
    });
}
/// <summary>
/// Creates a new event with the given creation flags.
/// </summary>
/// <param name="flags">Parameters for event creation.</param>
/// <exception cref="CudaException">Thrown when <c>cuEventCreate</c> does not report success.</exception>
public CudaEvent(CUEventFlags flags)
{
    _event = new CUevent();
    res = DriverAPINativeMethods.Events.cuEventCreate(ref _event, flags);

    // The format call stays inside Debug.WriteLine so it is only evaluated when the
    // Debug call itself is compiled in.
    Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuEventCreate", res));

    if (res == CUResult.Success)
    {
        return;
    }

    throw new CudaException(res);
}
/// <summary>
/// Stores the CUDA instance, loads the built-in test frame module and creates the
/// start/stop event handles.
/// </summary>
/// <param name="cuda">CUDA instance used to create the events.</param>
public CudaTest(CUDA cuda)
{
    this._cuda = cuda;

    // Load the frame module.
    this.LoadFrameModule(Cuda.CudaModle.BuiltinModules.Test);

    // Create the CUDA event handles.
    this._cudaStartEvent = this._cuda.CreateEvent();
    this._cudaStopEvent = this._cuda.CreateEvent();
}
/// <summary>
/// Destroys an event, selecting the <c>_v2</c> driver entry point when <c>_version</c>
/// is 4000 or higher. The driver status code is stored in <c>LastError</c>.
/// </summary>
/// <param name="e">Event to destroy.</param>
public void DestroyEvent(CUevent e)
{
    bool useV2EntryPoint = _version >= 4000;

    this.LastError = useV2EntryPoint
        ? CUDADriver.cuEventDestroy_v2(e)
        : CUDADriver.cuEventDestroy(e);
}
/// <summary>
/// Make a compute stream wait on an event.<para/>
/// Makes all future work submitted to this stream wait until <paramref name="cuevent"/>
/// reports completion before beginning execution. This synchronization
/// will be performed efficiently on the device.
/// <para/>
/// The stream will wait only for the completion of the most recent
/// host call to <see cref="CudaEvent.Record()"/> on <paramref name="cuevent"/>. Once this call
/// has returned, any functions (including <see cref="CudaEvent.Record()"/> and
/// <see cref="Dispose()"/>) may be called on the event again, and the subsequent calls will
/// not have any effect on this stream.
/// <para/>
/// If the stream is 0 (the NULL stream), any future work submitted in any stream
/// will wait for the event to complete before beginning execution. This
/// effectively creates a barrier for all future work submitted to the context.
/// <para/>
/// If <see cref="CudaEvent.Record()"/> has not been called on the event, this call acts as if
/// the record has already completed, and so is a functional no-op.
/// </summary>
/// <param name="cuevent">Event the stream should wait on.</param>
/// <exception cref="ObjectDisposedException">Thrown when this instance is already disposed.</exception>
/// <exception cref="CudaException">Thrown when <c>cuStreamWaitEvent</c> does not report success.</exception>
public void WaitEvent(CUevent cuevent)
{
    if (disposed)
    {
        throw new ObjectDisposedException(this.ToString());
    }

    res = DriverAPINativeMethods.Streams.cuStreamWaitEvent(_stream, cuevent, 0);

    // The format call stays inside Debug.WriteLine so it is only evaluated when the
    // Debug call itself is compiled in.
    Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuStreamWaitEvent", res));

    if (res == CUResult.Success)
    {
        return;
    }

    throw new CudaException(res);
}
/// <summary>
/// Prepares an execution of <paramref name="function"/> from the module file
/// <paramref name="module"/>: resolves the module path, loads the module, looks up the
/// function and creates the start/stop events.
/// </summary>
/// <param name="cuda">CUDA instance used for all driver calls.</param>
/// <param name="module">Module file name; ".cubin" is appended unless the name already ends with "cubin".</param>
/// <param name="function">Name of the function to look up in the loaded module.</param>
public CUDAExecution(GASS.CUDA.CUDA cuda, string module, string function)
{
    this.parameters = new List <Parameter>();
    this.textures = new List <CUtexref>();
    this.CUDAInstance = cuda;

    bool hasCubinSuffix = module.EndsWith("cubin");
    if (!hasCubinSuffix)
    {
        module += ".cubin";
    }

    // Store the absolute path of the module file.
    FileInfo moduleFile = new FileInfo(module);
    this.Module = moduleFile.FullName;
    this.Function = function;

    this.CUDAModule = cuda.LoadModule(this.module);
    this.CUDAFunction = cuda.GetModuleFunction(this.function);

    this.start = cuda.CreateEvent();
    this.stop = cuda.CreateEvent();
}
// Native declaration called by cuEventElapsedTime: returns a CUresult status and writes
// the elapsed time, in milliseconds, to pMilliseconds.
// NOTE(review): presumably bound to the driver's cuEventElapsedTime export - confirm
// against the import attribute, which is not visible here.
// Reference (CUDA 3.1 toolkit docs, CUEVENT group):
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g7895332c94680b174ef41373af09d9ce.html
private static extern CUresult nativeEventElapsedTime(out float pMilliseconds, CUevent hStart, CUevent hEnd);
/// <summary>
/// Records an event on a default-constructed <c>CUstream</c>.
/// </summary>
/// <param name="e">Event to record.</param>
public void RecordEvent(CUevent e)
{
    CUstream defaultStream = new CUstream();

    this.RecordEvent(e, defaultStream);
}
/// <summary>
/// Benchmarks the "transpose_naive" and "transpose" kernels on a 4096 x 4096 float matrix,
/// prints the average time per launch for each, and verifies the result of the last run
/// against a host reference.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Init and select 1st device.
    CUDA cuda = new CUDA(0, true);

    // Load the module and look up both kernels.
    cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx"));
    CUfunction transpose = cuda.GetModuleFunction("transpose");
    CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive");

    const int size_x = 4096;
    const int size_y = 4096;

    // Host input: h_idata[i] = i.
    float[] h_idata = new float[size_x * size_y];
    for (int i = 0; i < h_idata.Length; i++)
    {
        h_idata[i] = (float)i;
    }

    // Copy the input to the device and allocate an output buffer of the same size.
    CUdeviceptr d_idata = cuda.CopyHostToDevice <float>(h_idata);
    CUdeviceptr d_odata = cuda.Allocate <float>(h_idata);

    // Setup execution parameters for the naive kernel.
    cuda.SetFunctionBlockShape(transpose_naive, BLOCK_DIM, BLOCK_DIM, 1);
    cuda.SetParameter(transpose_naive, 0, (uint)d_odata.Pointer);
    cuda.SetParameter(transpose_naive, IntPtr.Size, (uint)d_idata.Pointer);
    cuda.SetParameter(transpose_naive, IntPtr.Size * 2, (uint)size_x);
    cuda.SetParameter(transpose_naive, IntPtr.Size * 2 + 4, (uint)size_y);
    cuda.SetParameterSize(transpose_naive, (uint)(IntPtr.Size * 2 + 8));

    // Setup execution parameters for the optimized kernel.
    cuda.SetFunctionBlockShape(transpose, BLOCK_DIM, BLOCK_DIM, 1);
    cuda.SetParameter(transpose, 0, (uint)d_odata.Pointer);
    cuda.SetParameter(transpose, IntPtr.Size, (uint)d_idata.Pointer);
    cuda.SetParameter(transpose, IntPtr.Size * 2, (uint)size_x);
    cuda.SetParameter(transpose, IntPtr.Size * 2 + 4, (uint)size_y);
    cuda.SetParameterSize(transpose, (uint)(IntPtr.Size * 2 + 8));

    // Warmup so we don't time CUDA startup.
    cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
    cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);

    int numIterations = 100;
    Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y);

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    // Time the naive kernel.
    cuda.RecordEvent(start);
    for (int i = 0; i < numIterations; i++)
    {
        cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
    }
    cuda.SynchronizeContext();
    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    float naiveTime = cuda.ElapsedTime(start, end);
    Console.WriteLine("Naive transpose average time: {0} ms\n", naiveTime / numIterations);

    // Time the optimized kernel, reusing the same pair of events.
    cuda.RecordEvent(start);
    for (int i = 0; i < numIterations; i++)
    {
        cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM);
    }
    cuda.SynchronizeContext();
    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    float optimizedTime = cuda.ElapsedTime(start, end);
    Console.WriteLine("Optimized transpose average time: {0} ms\n", optimizedTime / numIterations);

    // Copy the result back and compare against the host reference.
    float[] h_odata = new float[size_x * size_y];
    cuda.CopyDeviceToHost <float>(d_odata, h_odata);

    float[] reference = new float[size_x * size_y];
    computeGold(reference, h_idata, size_x, size_y);

    bool res = CompareF(reference, h_odata, size_x * size_y);
    Console.WriteLine("Test {0}", res ? "PASSED" : "FAILED");

    // Release resources. The timing events were previously never destroyed.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);
    cuda.Free(d_idata);
    cuda.Free(d_odata);

    Console.ReadKey();
}
/// <summary>
/// Multiplies two randomly generated sparse matrices stored in CRS format on the GPU,
/// launching the named kernel <paramref name="repetition"/> times and printing timings.
/// </summary>
/// <param name="repetition">How many times the kernel should be launched.</param>
/// <param name="moduleFunction">CUDA kernel name looked up in "matrixKernels.cubin".</param>
/// <param name="blockSizeX">Block size X.</param>
/// <param name="blockSizeY">Block size Y.</param>
/// <param name="transposeGrid">Selects how the grid dimensions are computed: when false,
/// grid X is derived from Cols and grid Y from Rows; when true the roles are swapped.</param>
/// <returns>The Rows * Cols output buffer copied from the mapped host allocation.</returns>
public static float[] CRSSparseMM(int repetition, string moduleFunction, int blockSizeX, int blockSizeY, bool transposeGrid)
{
    CUDA cuda = new CUDA(0, true);

    // Load the module and look up the kernel.
    cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));
    CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("------------------------------------");
    Console.WriteLine("init Matrix");
    Stopwatch t = Stopwatch.StartNew();

    // Values in CRS format.
    float[] AVals, BVals;
    // Indexes in CRS format.
    int[] AIdx, BIdx;
    // Length of each row in CRS format.
    int[] ARowLen, BRowLen;

    int maxIndex = 0;
    MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);
    MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a running Stopwatch is a no-op, so reset first; otherwise the
    // "copy to device" figure below would also include the init time.
    t.Reset();
    t.Start();

    CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
    CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
    CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

    CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals);
    CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx);
    CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen);

    int outputSize = Rows * Cols;
    float[] output = new float[outputSize];

    // The output lives in device-mapped host memory; dOutput is its device-side address.
    IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    #region set cuda parameters
    int Aelements = AVals.Length;
    int Belements = BVals.Length;

    cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

    int offset = 0;
    cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
    offset += IntPtr.Size;

    cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer);
    offset += IntPtr.Size;

    cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;

    cuda.SetParameter(cuFunc, offset, (uint)Rows);
    offset += sizeof(int);
    cuda.SetParameter(cuFunc, offset, (uint)Cols);
    offset += sizeof(int);

    cuda.SetParameter(cuFunc, offset, (uint)Aelements);
    offset += sizeof(int);
    cuda.SetParameter(cuFunc, offset, (uint)Belements);
    offset += sizeof(int);

    cuda.SetParameterSize(cuFunc, (uint)offset);
    #endregion

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    // Ceiling division of the matrix extents by the block size.
    int gridDimX = (int)Math.Ceiling((Cols + 0.0) / (blockSizeX));
    int gridDimY = (int)Math.Ceiling((0.0 + Rows) / blockSizeY);
    if (transposeGrid)
    {
        gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
        gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY);
    }

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);

    for (int k = 0; k < repetition; k++)
    {
        cuda.Launch(cuFunc, gridDimX, gridDimY);
        cuda.SynchronizeContext();

        // Read the result straight out of the mapped host allocation.
        Marshal.Copy(outputPtr2, output, 0, outputSize);
    }

    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    timer.Stop();

    float cudaTime = cuda.ElapsedTime(start, end);

    Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
    // Pass milliseconds so the value matches the "ms" label (previously a TimeSpan was printed).
    Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed.TotalMilliseconds);

    // Never index past the end of the output when displayCount exceeds its length.
    int shown = Math.Min(displayCount, output.Length);
    Console.WriteLine();
    for (int i = 0; i < shown; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(AValsPtr);
    cuda.Free(AIdxPtr);
    cuda.Free(ALenghtPtr);

    cuda.Free(BValsPtr);
    cuda.Free(BIdxPtr);
    cuda.Free(BLenghtPtr);

    // NOTE(review): dOutput aliases the host allocation outputPtr2, which is itself never
    // released - confirm whether a host-side free should be used here instead.
    cuda.Free(dOutput);

    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);

    return output;
}
/// <summary>
/// Wraps an existing event handle; this constructor stores the handle and does not
/// create a new event.
/// </summary>
/// <param name="event_">Event handle to store in this instance.</param>
internal CudaEvent(CUevent event_) { _event = event_; }
/// <summary>
/// Records an event on the given stream.
/// The driver status code is stored in <c>LastError</c>.
/// </summary>
/// <param name="e">Event to record.</param>
/// <param name="stream">Stream the event is recorded on.</param>
public void RecordEvent(CUevent e, CUstream stream)
{
    var status = CUDADriver.cuEventRecord(e, stream);

    this.LastError = status;
}
/// <summary>
/// Multiplies a random CRS sparse matrix A by a random CRS sparse matrix B on the GPU,
/// feeding B to the kernel one row index <c>k</c> at a time as a dense vector bound to the
/// "vectorTexRef" texture. Two host buffers are used alternately for the vector.
/// </summary>
/// <param name="repetition">How many times the whole product is repeated.</param>
/// <param name="moduleFunction">CUDA kernel name looked up in "matrixKernels.cubin".</param>
/// <param name="blockSizeX">Block size X; also used to derive the grid width from Rows.</param>
/// <param name="blockSizeY">Block size Y.</param>
/// <returns>The Rows * Cols output buffer copied from the mapped host allocation.</returns>
public static float[] CRSSparseMMwithDenseVector(int repetition, string moduleFunction, int blockSizeX, int blockSizeY)
{
    CUDA cuda = new CUDA(0, true);

    // Load the module and look up the kernel.
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));
    CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("------------------------------------");
    Console.WriteLine("init Matrix");
    Stopwatch t = Stopwatch.StartNew();

    // Values in CRS format.
    float[] AVals, BVals;
    // Indexes in CRS format.
    int[] AIdx, BIdx;
    // Length of each row in CRS format.
    int[] ARowLen, BRowLen;

    int maxIndex = 0;
    MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);
    MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a running Stopwatch is a no-op, so reset first; otherwise the
    // "copy to device" figure below would also include the init time.
    t.Reset();
    t.Start();

    CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
    CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
    CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

    int outputSize = Rows * Cols;
    float[] output = new float[outputSize];

    // The output lives in device-mapped host memory; dOutput is its device-side address.
    IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

    // Size in bytes of the dense vector (maxIndex + 1 floats).
    uint memSize = (uint)((maxIndex + 1) * sizeof(float));

    CUstream stream0 = cuda.CreateStream();

    // Two host buffers, used alternately by the main loop (index k % 2).
    IntPtr[] mainVecIntPtrs = new IntPtr[2];
    mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
    mainVecIntPtrs[1] = cuda.AllocateHost(memSize);

    CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

    // Bind the device vector to the kernel's texture reference.
    CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");
    cuda.SetTextureFlags(cuTexRef, 0);
    cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    #region set cuda parameters
    int Aelements = AVals.Length;

    cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

    int offset = 0;
    cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
    offset += IntPtr.Size;

    cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;

    cuda.SetParameter(cuFunc, offset, (uint)Rows);
    offset += sizeof(int);
    cuda.SetParameter(cuFunc, offset, (uint)Cols);
    offset += sizeof(int);

    // Remember where the index parameter lives so it can be rewritten on every launch.
    int colIndexParamOffset = offset;
    cuda.SetParameter(cuFunc, offset, (uint)0);
    offset += sizeof(int);

    cuda.SetParameterSize(cuFunc, (uint)offset);
    #endregion

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    // Ceiling division of Rows by the block width.
    int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);

    for (int rep = 0; rep < repetition; rep++)
    {
        for (int k = 0; k < Cols; k++)
        {
            // Fill the host buffer for index k, wait for the stream's previous work,
            // then queue the upload and the kernel launch on the stream.
            Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

            cuda.SynchronizeStream(stream0);

            cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
            cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
            cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);

            // Clear the other host buffer (entries of index k - 1).
            // NOTE(review): for k == 0 this passes -1 to Helpers.SetBufferIdx - confirm the
            // helper tolerates that.
            Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);
        }
    }

    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    timer.Stop();

    float cudaTime = cuda.ElapsedTime(start, end);

    // Read the result straight out of the mapped host allocation.
    Marshal.Copy(outputPtr2, output, 0, outputSize);

    Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
    // Pass milliseconds so the value matches the "ms" label (previously a TimeSpan was printed).
    Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed.TotalMilliseconds);

    // Never index past the end of the output when displayCount exceeds its length.
    int shown = Math.Min(displayCount, output.Length);
    Console.WriteLine();
    for (int i = 0; i < shown; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(AValsPtr);
    cuda.Free(AIdxPtr);
    cuda.Free(ALenghtPtr);

    // NOTE(review): dOutput aliases the host allocation outputPtr2, and the two
    // mainVecIntPtrs host buffers are never released - confirm the correct host-side
    // free calls for this CUDA wrapper.
    cuda.Free(dOutput);

    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);

    cuda.DestroyStream(stream0);
    cuda.Free(mainVecPtr);
    cuda.DestroyTexture(cuTexRef);

    return output;
}
// Native declaration taking an event handle and returning a CUresult status.
// NOTE(review): presumably bound to the driver's cuEventDestroy export - confirm against
// the import attribute, which is not visible here.
// Reference (CUDA 3.1 toolkit docs, CUEVENT group):
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g349006734f6e7378ea36cb57c239d4c7.html
private static extern CUresult nativeEventDestroy(CUevent hEvent);
// Native declaration called by cuEventRecord(CUevent, CUstream): takes an event and a
// stream and returns a CUresult status.
// NOTE(review): presumably bound to the driver's cuEventRecord export - confirm against
// the import attribute, which is not visible here.
// Reference (CUDA 3.1 toolkit docs, CUEVENT group):
//http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g93468fbdae4190b79926381a90a94301.html
private static extern CUresult nativeEventRecord(CUevent hEvent, CUstream hStream);
/// <summary>
/// Issues a host-to-device copy, a launch of "increment_kernel" and a device-to-host copy
/// asynchronously, brackets them with two events, then prints the elapsed GPU time and
/// checks the result.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static void Main(string[] args)
{
    // Create a new instance of CUDA class, select 1st device.
    CUDA cuda = new CUDA(0, true);

    // Prepare parameters.
    int n = 16 * 1024 * 1024;
    uint nbytes = (uint)(n * sizeof(int));
    int value = 26;

    // Allocate host memory.
    int[] a = new int[n];

    // Allocate device memory and fill every byte with 0xff.
    CUdeviceptr d_a = cuda.Allocate <int>(a);
    CUDADriver.cuMemsetD8(d_a, 0xff, nbytes);

    // Load module.
    cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx"));
    CUfunction func = cuda.GetModuleFunction("increment_kernel");

    // Set kernel launch configuration.
    cuda.SetFunctionBlockShape(func, 512, 1, 1);

    // Create cuda event handles.
    CUevent start = cuda.CreateEvent();
    CUevent stop = cuda.CreateEvent();

    // Asynchronously issue work to the GPU (all to stream 0).
    CUstream stream = new CUstream();
    cuda.RecordEvent(start);
    cuda.CopyHostToDeviceAsync <int>(d_a, a, stream);

    // Set parameters for kernel function.
    cuda.SetParameter(func, 0, (uint)d_a.Pointer);
    cuda.SetParameter(func, IntPtr.Size, (uint)value);
    cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4));

    // Actually launch kernel.
    cuda.LaunchAsync(func, n / 512, 1, stream);

    // Queue the copy back of the data, then mark the end of the GPU work.
    cuda.CopyDeviceToHostAsync <int>(d_a, a, stream);
    cuda.RecordEvent(stop);

    // Everything above was only queued. Wait for the stop event before querying the
    // elapsed time or reading the host buffer; otherwise the work may still be running.
    cuda.SynchronizeEvent(stop);

    // Print the GPU time.
    Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop));

    // Check the output for correctness.
    if (CorrectOutput(a, value))
    {
        Console.WriteLine("Test PASSED");
    }
    else
    {
        Console.WriteLine("Test FAILED");
    }

    // Release resources.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(stop);
    cuda.Free(d_a);
}
/// <summary>
/// Calls <c>cuEventSynchronize</c> for the given event.
/// The driver status code is stored in <c>LastError</c>.
/// </summary>
/// <param name="e">Event to synchronize on.</param>
public void SynchronizeEvent(CUevent e)
{
    var status = CUDADriver.cuEventSynchronize(e);

    this.LastError = status;
}
// Native declaration returning a CUresult status; writes the event handle to phEvent and
// takes the creation flags in Flags.
// NOTE(review): presumably bound to the driver's cuEventCreate export - confirm against
// the import attribute, which is not visible here.
// Reference (CUDA 3.1 toolkit docs, CUEVENT group):
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_g433317083f929b9298f8a88d57aa5017.html
private static extern CUresult nativeEventCreate(out CUevent phEvent, CUevent_flags Flags);
// Native declaration taking an event handle and returning a CUresult status.
// NOTE(review): presumably bound to the driver's cuEventSynchronize export - confirm
// against the import attribute, which is not visible here.
// Reference (CUDA 3.1 toolkit docs, CUEVENT group):
// http://developer.download.nvidia.com/compute/cuda/3_1/toolkit/docs/online/group__CUEVENT_ge3ed6a308c602d139373895cb99cb7ab.html
private static extern CUresult nativeEventSynchronize(CUevent hEvent);
/// <summary>
/// Records <paramref name="hEvent"/> on <c>CUstream.Null</c>.
/// </summary>
/// <param name="hEvent">Event to record.</param>
public static void cuEventRecord(CUevent hEvent)
{
    CUstream nullStream = CUstream.Null;

    cuEventRecord(hEvent, nullStream);
}