/// <summary>
/// Reads the CUDA source file at <paramref name="path"/>, compiles it with the runtime
/// compiler and, when compilation succeeds, loads the entry point named "Run".
/// </summary>
/// <param name="path">Path of the CUDA source file; its file name is passed to the compiler as the program name.</param>
/// <param name="kernel">Receives the loaded kernel, or <c>null</c> when compilation failed.</param>
/// <param name="log">Receives the compiler log, whether or not compilation succeeded.</param>
/// <returns>
/// <c>nvrtcResult.Success</c> when <c>Compile</c> did not throw; otherwise the error code
/// carried by the caught <c>NVRTCException</c>.
/// </returns>
private nvrtcResult LoadKernel(string path, out CudaKernel kernel, out string log)
{
    kernel = null;
    var status = nvrtcResult.Success;

    string sourceText = File.ReadAllText(path);
    string programName = Path.GetFileName(path);

    using (var compiler = new CudaRuntimeCompiler(sourceText, programName))
    {
        try
        {
            // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options
            compiler.Compile(new string[0]);
        }
        catch (NVRTCException failure)
        {
            status = failure.NVRTCError;
        }

        // The log is fetched on both the success and the failure path.
        log = compiler.GetLogAsString();

        if (status != nvrtcResult.Success)
        {
            return status;
        }

        byte[] image = compiler.GetPTX();

        // "Run" is the hard-coded method name from the CUDA kernel.
        kernel = this._context.CudaContext.LoadKernelFatBin(image, "Run");
    }

    return status;
}
/// <summary>
/// Compiles the CUDA source file <paramref name="kernelSourceFile"/> and writes the compiler
/// log next to it as "&lt;name&gt;.ptx.log". On success the "Run" entry point is loaded and
/// the generated PTX is written next to the source as "&lt;name&gt;.ptx".
/// </summary>
/// <param name="kernelSourceFile">Path of the CUDA source file to compile.</param>
/// <param name="kernel">Receives the loaded kernel, or <c>null</c> when compilation failed.</param>
/// <returns>
/// <c>nvrtcResult.Success</c> when <c>Compile</c> did not throw; otherwise the error code
/// carried by the caught <c>NVRTCException</c>.
/// </returns>
private nvrtcResult LoadKernel(string kernelSourceFile, out CudaKernel kernel)
{
    kernel = null;
    var status = nvrtcResult.Success;

    using (var rtc = new CudaRuntimeCompiler(File.ReadAllText(kernelSourceFile), Path.GetFileName(kernelSourceFile)))
    {
        try
        {
            rtc.Compile(new string[0]);
        }
        catch (NVRTCException compileError)
        {
            status = compileError.NVRTCError;
        }

        // Output files share the source file's directory and base name.
        string directory = Path.GetDirectoryName(kernelSourceFile);
        string baseName = Path.GetFileNameWithoutExtension(kernelSourceFile);
        string outputStem = Path.Combine(directory, baseName);

        // The log file is written even when compilation failed.
        File.WriteAllText(outputStem + ".ptx.log", rtc.GetLogAsString());

        if (status == nvrtcResult.Success)
        {
            byte[] image = rtc.GetPTX();
            kernel = _CudaContext.LoadKernelFatBin(image, "Run");
            File.WriteAllBytes(outputStem + ".ptx", image);
        }
    }

    return status;
}
/// <summary>
/// Compiles the CUDA source file referenced by <c>path</c> and, when compilation succeeds,
/// loads the entry point named by <c>methodName</c> into <c>multiply</c> using <c>ctx</c>.
/// </summary>
/// <param name="log">Receives the compiler log, whether or not compilation succeeded.</param>
/// <returns>
/// <c>nvrtcResult.Success</c> when <c>Compile</c> did not throw; otherwise the error code
/// carried by the caught <c>NVRTCException</c>. <c>multiply</c> is left untouched on failure.
/// </returns>
internal nvrtcResult LoadKernel(out string log)
{
    var status = nvrtcResult.Success;

    using (var compiler = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path)))
    {
        try
        {
            compiler.Compile(Array.Empty<string>());
        }
        catch (NVRTCException failure)
        {
            status = failure.NVRTCError;
        }

        log = compiler.GetLogAsString();

        if (status != nvrtcResult.Success)
        {
            return status;
        }

        byte[] image = compiler.GetPTX();
        multiply = ctx.LoadKernelFatBin(image, methodName);
    }

    return status;
}
/// <summary>
/// Creates the CUDA context, compiles the CUDA source file referenced by <c>path</c> and
/// loads the entry point named by <c>methodName</c> into <c>kernel</c>.
/// </summary>
/// <remarks>
/// Compilation errors are not caught here: any exception thrown by <c>Compile</c> propagates
/// to the caller, in which case <c>ctx</c> has already been assigned but <c>kernel</c> has not.
/// </remarks>
internal void LoadKernel()
{
    // NOTE(review): arguments (0, true) are passed through unchanged; presumably device 0 — confirm.
    ctx = new CudaContext(0, true);

    using var rtc = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path));
    rtc.Compile(Array.Empty<string>());

    // Reaching this point means Compile did not throw, so no separate status check is needed.
    byte[] ptx = rtc.GetPTX();
    kernel = ctx.LoadKernelFatBin(ptx, methodName);
}
/// <summary>
/// Reads "./assets/CUDA_kernels/&lt;kernelName&gt;.cu", compiles it with the runtime compiler,
/// writes the compiler log to the console and returns the generated PTX.
/// </summary>
/// <param name="kernelName">Base name of the .cu file; also passed to the compiler as the program name.</param>
/// <returns>The PTX image produced by the compiler.</returns>
/// <remarks>
/// Exceptions from reading the file or from compilation propagate to the caller. The compiler
/// is disposed on every path, including when compilation throws.
/// </remarks>
private static byte[] prepare_kernel(string kernelName)
{
    string fileToCompile = File.ReadAllText("./assets/CUDA_kernels/" + kernelName + ".cu");

    // 'using' guarantees the native compiler handle is released even if Compile throws.
    using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, kernelName))
    {
        rtc.Compile(new string[] {});

        string log = rtc.GetLogAsString();
        Console.WriteLine(log);

        return rtc.GetPTX();
    }
}
/// <summary>
/// Creates the CUDA context and, for every entry in <c>cudafiles</c>, compiles
/// "&lt;dataPath&gt;\Scripts\CUDA\&lt;name&gt;.cu" with <c>CompileOption</c> and stores the loaded
/// kernel at the same index in <c>cudaKernel</c>.
/// </summary>
/// <remarks>
/// Each file name doubles as the compiler program name and as the kernel entry point name.
/// The source text and the compiler log of every file are written to the debug log.
/// Compilation errors are not caught; the compiler is disposed on every path.
/// </remarks>
protected void InitializeCUDA()
{
    cudaKernel = new CudaKernel[cudafiles.Length];
    ctx = new CudaContext(0);

    for (int i = 0; i < cudafiles.Length; ++i)
    {
        string source = File.ReadAllText(Application.dataPath + @"\Scripts\CUDA\" + cudafiles[i] + ".cu");
        Debug.Log(source);

        byte[] ptx;

        // 'using' releases the compiler even when Compile throws part-way through the loop.
        using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(source, cudafiles[i]))
        {
            rtc.Compile(CompileOption);
            Debug.Log(rtc.GetLogAsString());
            ptx = rtc.GetPTX();
        }

        cudaKernel[i] = ctx.LoadKernelPTX(ptx, cudafiles[i]);
    }
}
/// <summary>
/// Compiles the embedded "blur" CUDA kernel for the compute capability reported for device 0,
/// loads it into <c>kernel</c>, configures its launch dimensions from <c>Size</c> and prepares
/// the <c>input</c> and <c>output</c> device buffers.
/// </summary>
/// <exception cref="Exception">
/// Thrown when the runtime compiler reports an error. The message contains the NVRTC error
/// code followed by the original exception text, and the original
/// <c>NVRTCException</c> is available as <c>InnerException</c>.
/// </exception>
public void CompileKernel()
{
    //generate as output language obviously from strict code
    // The kernel is kept on separate lines so its trailing '//' comment cannot swallow the closing braces.
    var code = @"extern ""C"" __global__ void blur(unsigned char* image, unsigned char* output, size_t width, size_t height)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid > width && tid < width*height-width)
    {
        output[tid] = image[tid];// (image[tid-2048]+image[tid-1]+image[tid]+image[tid+1]+image[tid+2048])/5;
    }
}";

    // One device id for both the capability query and the context creation.
    const int deviceID = 0;

    using var rtc = new CudaRuntimeCompiler(code, "blur");
    try
    {
        // Use max capabilities on actual hardware we have at runtime
        var computeVersion = CudaContext.GetDeviceComputeCapability(deviceID);
        var shaderModelVersion = "" + computeVersion.Major + computeVersion.Minor;
        Console.WriteLine("ShaderModelVersion=" + shaderModelVersion);

        // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options
        //https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
        //nvcc .\vectorAdd.cu -use_fast_math -ptx -m 64 -arch compute_61 -code sm_61 -o .\vectorAdd.ptx
        //https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
        rtc.Compile(new[] { "--gpu-architecture=compute_" + shaderModelVersion });
        Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());

        // NOTE(review): this context is neither stored nor disposed here; confirm that the
        // loaded kernel (or another owner) keeps it alive for as long as the kernel is used.
        var ctx = new CudaContext(deviceID);
        kernel = ctx.LoadKernelPTX(rtc.GetPTX(), "blur");

        // One thread per element, 512 threads per block, rounded up to cover Size.
        kernel.GridDimensions = (Size + 511) / 512;
        kernel.BlockDimensions = 512;

        input = image;
        output = new CudaDeviceVariable<byte>(Size);
    }
    catch (NVRTCException ex)
    {
        Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());

        // Keep the original exception as InnerException so its type and stack trace survive.
        throw new Exception(ex.NVRTCError + " " + ex, ex);
    }
}
/// <summary>
/// Scratch/sample routine: compiles an embedded "saxpy" CUDA kernel with debug information
/// ("-G"), loads it, and then demonstrates loading several kernels from a single module file.
/// </summary>
/// <remarks>
/// The context and the runtime compiler are disposed on every path. The module path
/// "path\to\kernel.ptx" and the names "kernel1"/"kernel2" are placeholders taken over unchanged.
/// </remarks>
public void Compile()
{
    using (var ctx = new CudaContext())
    {
        // With a verbatim string (@) only double quotes have to be doubled: no other escaping.
        string source = @"
extern ""C"" __global__ void saxpy(float a, float *x, float *y, float *out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = a * x[tid] + y[tid];
    }
}
";
        source += Environment.NewLine;

        var name = "Test";
        var headers = new string[0];
        var includeNames = new string[0];

        byte[] ptx;

        // 'using' releases the compiler even when Compile throws.
        using (var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames))
        {
            // Other options to consider: --ptxas-options=-v -keep
            compiler.Compile(new string[] { "-G" });

            // For debugging, compiler.GetPTXAsString() can be inspected here.
            ptx = compiler.GetPTX();
        }

        // The entry point name must match the extern "C" function defined in the source above.
        var kernel = ctx.LoadKernelPTX(ptx, "saxpy");

        //One kernel per cu file:
        //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
        kernel.GridDimensions = new dim3(1, 1, 1);
        kernel.BlockDimensions = new dim3(16, 16);
        //kernel.Run()

        // NOTE(review): unused scratch allocation kept from the original.
        var a = new CudaDeviceVariable<double>(100);
        //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

        //Multiple kernels per cu file:
        // NOTE(review): placeholder path; LoadModule presumably fails unless this file exists — confirm.
        CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
        CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
        CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
    }
}
/// <summary>
/// Compiles "vectorAdd_kernel.cu" at run time, adds two vectors of 50000 pseudo-random floats
/// on the device with the "vectorAdd" kernel and verifies the result against a host-side sum.
/// </summary>
/// <param name="args">Command-line arguments; passed unchanged to the runtime compiler as compile options.</param>
/// <remarks>
/// The compiler, the context and all device buffers are released on every exit path,
/// including the early return after a failed verification and any exception.
/// </remarks>
static void Main(string[] args)
{
    string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
    string fileToCompile = File.ReadAllText(filename);

    byte[] ptx;
    using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"))
    {
        rtc.Compile(args);
        string log = rtc.GetLogAsString();
        Console.WriteLine(log);
        ptx = rtc.GetPTX();
    }

    using (CudaContext ctx = new CudaContext(0))
    {
        CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");

        // Print the vector length to be used
        int numElements = 50000;
        Console.WriteLine("[Vector addition of {0} elements]", numElements);

        // Allocate the host input vectors A and B and the host output vector C
        float[] h_A = new float[numElements];
        float[] h_B = new float[numElements];
        float[] h_C = new float[numElements];

        // Fixed seed: the same input data is generated on every run.
        Random rand = new Random(0);

        // Initialize the host input vectors
        for (int i = 0; i < numElements; ++i)
        {
            h_A[i] = (float)rand.NextDouble();
            h_B[i] = (float)rand.NextDouble();
        }

        Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");

        // Allocate the device input vectors A and B (copied from the host arrays)
        // and the device output vector C.
        using (CudaDeviceVariable<float> d_A = h_A)
        using (CudaDeviceVariable<float> d_B = h_B)
        using (CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements))
        {
            // Launch the Vector Add CUDA Kernel
            int threadsPerBlock = 256;
            int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; // round up
            Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
            vectorAdd.BlockDimensions = new dim3(threadsPerBlock, 1, 1);
            vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1);

            vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

            // Copy the device result vector in device memory to the host result vector
            // in host memory.
            Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
            d_C.CopyToHost(h_C);

            // Verify that the result vector is correct
            for (int i = 0; i < numElements; ++i)
            {
                if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                {
                    Console.WriteLine("Result verification failed at element {0}!\n", i);

                    // The enclosing 'using' blocks still free the device buffers and the context.
                    return;
                }
            }

            Console.WriteLine("Test PASSED\n");
        }
    }

    // Printed only after device memory and the context have been released.
    Console.WriteLine("Done\n");
}