// Native binding (extern, no managed body) for curandSetStream; private, so only reachable from within its declaring type.
// Passes a generator handle and a stream handle to the native call and returns its curandStatus.
// NOTE(review): presumably associates `stream` with `generator` for later work — confirm against the cuRAND documentation.
private static extern curandStatus curandSetStream(RandGenerator generator, CUstream stream);
// Native binding (extern, no managed body) for cuStreamQuery.
// hStream is the stream handle handed to the native call; returns the CUResult status of that call.
// NOTE(review): going by the name, presumably a non-blocking completion check — confirm against the driver API docs.
public static extern CUResult cuStreamQuery(CUstream hStream);
// Native binding (extern, no managed body) for cuStreamSynchronize.
// hStream is the stream handle handed to the native call; returns the CUResult status of that call.
// NOTE(review): going by the name, presumably blocks until work queued on hStream completes — confirm against the driver API docs.
public static extern CUResult cuStreamSynchronize(CUstream hStream);
// Native binding (extern, no managed body) for cuStreamCreate.
// phStream is passed by reference (presumably so the native side can store the new stream handle in it — confirm);
// Flags is forwarded unchanged. Returns the CUResult status of the native call.
public static extern CUResult cuStreamCreate(ref CUstream phStream, uint Flags);
// Native binding (extern, no managed body) for cuStreamDestroy_v2.
// hStream is the stream handle handed to the native call; returns the CUResult status of that call.
// NOTE(review): presumably releases the stream; the handle should not be reused afterwards — confirm against the driver API docs.
public static extern CUResult cuStreamDestroy_v2(CUstream hStream);
// Native binding (extern, no managed body) for cuGLMapBufferObjectAsync.
// dptr and size are passed by reference (presumably filled in by the native side with the mapped device address
// and its size — confirm); bufferobj is an unsigned buffer-object id and hStream the stream handle.
// Returns the CUResult status of the native call.
// NOTE(review): size is marshalled as a 32-bit uint — verify this matches the width the bound entry point expects.
public static extern CUResult cuGLMapBufferObjectAsync(ref CUdeviceptr dptr, ref uint size, uint bufferobj, CUstream hStream);
// Native binding (extern, no managed body) for cuEventRecord.
// hEvent is the event handle and hStream the stream handle handed to the native call.
// Returns the CUResult status of the native call.
// NOTE(review): presumably records the event into the given stream's work queue — confirm against the driver API docs.
public static extern CUResult cuEventRecord(CUevent hEvent, CUstream hStream);
// Native binding (extern, no managed body) for cuMemcpyDtoHAsync.
// dstHost is a raw host address (IntPtr), srcDevice a device pointer, ByteCount the transfer size and hStream the stream handle.
// Returns the CUResult status of the native call.
// NOTE(review): as the copy is named "Async", the memory behind dstHost presumably has to stay valid (and pinned)
// until the stream has finished the transfer — confirm against the driver API docs.
public static extern CUResult cuMemcpyDtoHAsync(IntPtr dstHost, CUdeviceptr srcDevice, SizeT ByteCount, CUstream hStream);
// Native binding (extern, no managed body) for cuMemcpyPeerAsync.
// Takes a destination device pointer with its context, a source device pointer with its context,
// the transfer size in ByteCount and the stream handle hStream. Returns the CUResult status of the native call.
// NOTE(review): presumably a device-to-device copy across contexts — confirm against the driver API docs.
public static extern CUResult cuMemcpyPeerAsync(CUdeviceptr dstDevice, CUcontext dstContext, CUdeviceptr srcDevice, CUcontext srcContext, SizeT ByteCount, CUstream hStream);
// Native binding (extern, no managed body) for cuMemcpyAtoHAsync.
// dstHost is a raw host address (IntPtr), srcArray a CUarray handle, SrcIndex a position within that array
// (units not visible here — TODO confirm), ByteCount the transfer size and hStream the stream handle.
// Returns the CUResult status of the native call.
public static extern CUResult cuMemcpyAtoHAsync(IntPtr dstHost, CUarray srcArray, SizeT SrcIndex, SizeT ByteCount, CUstream hStream);
// Native binding (extern, no managed body) for cuMemcpyDtoDAsync.
// dstDevice and srcDevice are device pointers, ByteCount the transfer size and hStream the stream handle.
// Returns the CUResult status of the native call.
public static extern CUResult cuMemcpyDtoDAsync(CUdeviceptr dstDevice, CUdeviceptr srcDevice, SizeT ByteCount, CUstream hStream);
// Native binding (extern, no managed body) for cuMemcpy3DAsync.
// pCopy is a CUDAMemCpy3D descriptor passed by reference; hStream is the stream handle.
// Returns the CUResult status of the native call.
// NOTE(review): the descriptor presumably carries source, destination and extent of the copy — see its declaration.
public static extern CUResult cuMemcpy3DAsync(ref CUDAMemCpy3D pCopy, CUstream hStream);
// Native binding (extern, no managed body) for cuLaunchGridAsync.
// f is the function handle to launch, grid_width and grid_height give the grid dimensions (in blocks, presumably — confirm),
// and hStream is the stream handle. Returns the CUResult status of the native call.
public static extern CUResult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
// Native binding (extern, no managed body) for cuGraphicsUnmapResources.
// count is passed alongside the resources array (presumably the number of entries to process — confirm);
// resources is marshalled input-only ([In]); hStream is the stream handle.
// Returns the CUResult status of the native call.
public static extern CUResult cuGraphicsUnmapResources(uint count, [In] CUgraphicsResource[] resources, CUstream hStream);
/// <summary>
/// Managed entry point that forwards to the native <c>curandSetStream</c> binding.
/// </summary>
/// <param name="generator">Generator handle handed through unchanged.</param>
/// <param name="stream">Stream handle handed through unchanged.</param>
/// <returns>The status reported by the native call, unmodified.</returns>
public curandStatus SetStream(RandGenerator generator, CUstream stream)
{
    curandStatus status = curandSetStream(generator, stream);
    return status;
}
// Native binding (extern, no managed body) for cuMemcpyHtoAAsync.
// dstArray is a CUarray handle, dstIndex a position within that array (units not visible here — TODO confirm),
// pSrc a raw host address (IntPtr), ByteCount the transfer size and hStream the stream handle.
// Returns the CUResult status of the native call.
public static extern CUResult cuMemcpyHtoAAsync(CUarray dstArray, SizeT dstIndex, IntPtr pSrc, SizeT ByteCount, CUstream hStream);
/// <summary>
/// Sample entry point: uploads an integer array, runs the "increment_kernel" function from
/// asyncAPI.ptx on it, downloads the result, prints the GPU time measured between two events
/// and reports whether the output passed the correctness check.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // CUDA wrapper bound to device 0.
    CUDA cuda = new CUDA(0, true);

    // Problem size and the value handed to the kernel.
    int elementCount = 16 * 1024 * 1024;
    uint byteCount = (uint)(elementCount * sizeof(int));
    int increment = 26;

    // Host buffer plus a device buffer sized from it; the device buffer is filled with 0xff bytes.
    int[] hostData = new int[elementCount];
    CUdeviceptr deviceData = cuda.Allocate<int>(hostData);
    CUDADriver.cuMemsetD8(deviceData, 0xff, byteCount);

    // Kernel lookup: the module is read from the current directory.
    string modulePath = Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx");
    cuda.LoadModule(modulePath);
    CUfunction kernel = cuda.GetModuleFunction("increment_kernel");

    // 512 x 1 x 1 threads per block.
    cuda.SetFunctionBlockShape(kernel, 512, 1, 1);

    // Events bracketing the GPU work for timing.
    CUevent startEvent = cuda.CreateEvent();
    CUevent stopEvent = cuda.CreateEvent();

    // Default-constructed stream handle; every asynchronous call below is issued on it
    // (the original author describes this as "stream 0").
    CUstream workStream = new CUstream();

    cuda.RecordEvent(startEvent);
    cuda.CopyHostToDeviceAsync<int>(deviceData, hostData, workStream);

    // Kernel arguments: the device pointer (cast to uint) at offset 0, the increment at
    // offset IntPtr.Size; the total parameter block is IntPtr.Size + 4 bytes.
    cuda.SetParameter(kernel, 0, (uint)deviceData.Pointer);
    cuda.SetParameter(kernel, IntPtr.Size, (uint)increment);
    cuda.SetParameterSize(kernel, (uint)(IntPtr.Size + 4));

    // One block per 512 elements.
    cuda.LaunchAsync(kernel, elementCount / 512, 1, workStream);

    // Queue the copy back to the host and mark the end of the timed region.
    cuda.CopyDeviceToHostAsync<int>(deviceData, hostData, workStream);
    cuda.RecordEvent(stopEvent);

    // NOTE(review): no explicit synchronization call is visible between recording the stop event and
    // reading the elapsed time / the host array — confirm that ElapsedTime or the async copy waits as needed.
    Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(startEvent, stopEvent));

    // Validate the downloaded data.
    string verdict = CorrectOutput(hostData, increment) ? "Test PASSED" : "Test FAILED";
    Console.WriteLine(verdict);

    // Release the events and the device buffer.
    cuda.DestroyEvent(startEvent);
    cuda.DestroyEvent(stopEvent);
    cuda.Free(deviceData);
}
// Native binding (extern, no managed body) for cuMemcpyHtoDAsync.
// dstDevice is a device pointer, srcHost a raw host address (IntPtr), ByteCount the transfer size and hStream the stream handle.
// Returns the CUResult status of the native call.
// NOTE(review): as the copy is named "Async", the memory behind srcHost presumably has to stay valid (and pinned)
// until the stream has finished the transfer — confirm against the driver API docs.
public static extern CUResult cuMemcpyHtoDAsync(CUdeviceptr dstDevice, IntPtr srcHost, SizeT ByteCount, CUstream hStream);
// Native binding (extern, no managed body) for cuGLUnmapBufferObjectAsync.
// bufferobj is an unsigned buffer-object id and hStream the stream handle handed to the native call.
// Returns the CUResult status of the native call.
public static extern CUResult cuGLUnmapBufferObjectAsync(uint bufferobj, CUstream hStream);
/// <summary>
/// Builds one task: computes the buffer sizes from the channel count, allocates the device buffers
/// and the host buffers (the latter through cuMemAllocHost), resolves the kernels by name, creates
/// a stream, and sets up the managed sample/output buffers, the frame with its bit writer and,
/// when requested, a verifying reader.
/// </summary>
/// <param name="_cuda">CUDA wrapper used for allocation, kernel lookup and stream creation; stored in the <c>cuda</c> field.</param>
/// <param name="channelCount">Channel count used to size every buffer and to construct the frame.</param>
/// <param name="channels">Channel count handed to the verifier's PCM configuration.</param>
/// <param name="bits_per_sample">Sample width handed to the verifier's PCM configuration.</param>
/// <param name="max_frame_size">Per-frame byte budget used to size the output buffer.</param>
/// <param name="do_verify">When true, a <c>FlakeReader</c> with CRC checking switched off is created.</param>
/// <exception cref="CUDAException">
/// Thrown with the failing status when a host allocation does not succeed; host buffers obtained
/// up to that point are released and their pointers reset first.
/// </exception>
unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify)
{
    cuda = _cuda;

    // Sizes (in bytes) that are kept in fields.
    residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames;
    bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames;
    samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount;

    // Sizes (in bytes) needed only during construction.
    int partitionBytes = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames;
    int riceParamBytes = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames;
    int lpcDataBytes = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FlaCudaWriter.maxFrames;

    // Derived sizes shared by a device buffer and its host counterpart.
    uint sampleBytes = (uint)samplesBufferLen;
    uint halfSampleBytes = (uint)samplesBufferLen / 2;
    uint bestRiceParamBytes = (uint)riceParamBytes / 4;
    uint residualTaskBytes = (uint)residualTasksLen;
    uint bestResidualTaskBytes = (uint)bestResidualTasksLen;

    // Device buffers.
    cudaSamplesBytes = cuda.Allocate(halfSampleBytes);
    cudaSamples = cuda.Allocate(sampleBytes);
    cudaResidual = cuda.Allocate(sampleBytes);
    cudaLPCData = cuda.Allocate((uint)lpcDataBytes);
    cudaPartitions = cuda.Allocate((uint)partitionBytes);
    cudaRiceParams = cuda.Allocate((uint)riceParamBytes);
    cudaBestRiceParams = cuda.Allocate(bestRiceParamBytes);
    cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames)));
    cudaResidualTasks = cuda.Allocate(residualTaskBytes);
    cudaBestResidualTasks = cuda.Allocate(bestResidualTaskBytes);
    // The literal 64 stands in for FlaCudaWriter.maxResidualParts (per the original author's inline note).
    cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 * FlaCudaWriter.maxFrames));

    // Host buffers: each allocation is attempted only while every previous one succeeded.
    CUResult status = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, halfSampleBytes);
    if (status == CUResult.Success)
    {
        status = CUDADriver.cuMemAllocHost(ref residualBufferPtr, sampleBytes);
    }
    if (status == CUResult.Success)
    {
        status = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, bestRiceParamBytes);
    }
    if (status == CUResult.Success)
    {
        status = CUDADriver.cuMemAllocHost(ref residualTasksPtr, residualTaskBytes);
    }
    if (status == CUResult.Success)
    {
        status = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, bestResidualTaskBytes);
    }

    if (status != CUResult.Success)
    {
        // Roll back whichever host buffers were obtained, reset every pointer, then report the failure.
        // NOTE(review): the device buffers allocated above are not released on this path — confirm
        // whether the owner of the CUDA context cleans them up.
        if (samplesBytesPtr != IntPtr.Zero)
        {
            CUDADriver.cuMemFreeHost(samplesBytesPtr);
        }
        samplesBytesPtr = IntPtr.Zero;

        if (residualBufferPtr != IntPtr.Zero)
        {
            CUDADriver.cuMemFreeHost(residualBufferPtr);
        }
        residualBufferPtr = IntPtr.Zero;

        if (bestRiceParamsPtr != IntPtr.Zero)
        {
            CUDADriver.cuMemFreeHost(bestRiceParamsPtr);
        }
        bestRiceParamsPtr = IntPtr.Zero;

        if (residualTasksPtr != IntPtr.Zero)
        {
            CUDADriver.cuMemFreeHost(residualTasksPtr);
        }
        residualTasksPtr = IntPtr.Zero;

        if (bestResidualTasksPtr != IntPtr.Zero)
        {
            CUDADriver.cuMemFreeHost(bestResidualTasksPtr);
        }
        bestResidualTasksPtr = IntPtr.Zero;

        throw new CUDAException(status);
    }

    // Kernel handles, resolved by name from the module loaded on the CUDA wrapper.
    cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor");
    cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr");
    cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr");
    cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2");
    cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits");
    cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC");
    cudaQuantizeLPC = cuda.GetModuleFunction("cudaQuantizeLPC");
    cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice");
    cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual");
    cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8");
    cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12");
    cudaEstimateResidual1 = cuda.GetModuleFunction("cudaEstimateResidual1");
    cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod");
    cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod");
    cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo");
    cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual");
    cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition");
    cudaCalcPartition16 = cuda.GetModuleFunction("cudaCalcPartition16");
    cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition");
    cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition");
    cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter");
    cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder");

    stream = cuda.CreateStream();

    // Managed-side buffers; the output buffer holds max_frame_size bytes per frame plus one spare byte.
    samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount];
    outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1];

    // The frame writes into the whole output buffer, starting at offset 0.
    frame = new FlacFrame(channelCount);
    frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length);

    if (!do_verify)
    {
        return;
    }

    // Verification reader: configured with a fixed 44100 sample rate and CRC checking disabled.
    verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100));
    verify.DoCRC = false;
}