CudaRuntimeCompiler.Compile C# (CSharp)代码示例

示例#1

0

显示文件

文件： KernelLoader.cs 项目： NSqda/XCoinTrader

        /// <summary>
        /// Compiles the CUDA source file at <paramref name="path"/> with NVRTC and, when compilation
        /// succeeds, loads the entry point named "Run" from the resulting PTX.
        /// </summary>
        /// <param name="path">Path of the CUDA source file; its file name is used as the program name.</param>
        /// <param name="kernel">Receives the loaded kernel, or <c>null</c> when compilation failed.</param>
        /// <param name="log">Receives the compiler log, both on success and on failure.</param>
        /// <returns>
        /// <c>nvrtcResult.Success</c>, or the error code carried by the caught <c>NVRTCException</c>.
        /// </returns>
        private nvrtcResult LoadKernel(string path, out CudaKernel kernel, out string log)
        {
            using (var compiler = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path)))
            {
                var status = nvrtcResult.Success;

                try
                {
                    // No compiler options; see http://docs.nvidia.com/cuda/nvrtc/index.html for what is available.
                    compiler.Compile(new string[0]);
                }
                catch (NVRTCException compileError)
                {
                    status = compileError.NVRTCError;
                }

                // The log is fetched regardless of the outcome so callers can surface warnings too.
                log = compiler.GetLogAsString();

                if (status != nvrtcResult.Success)
                {
                    kernel = null;
                    return status;
                }

                var image = compiler.GetPTX();
                // "Run" is the hard-coded entry point name expected in the CUDA source.
                kernel = this._context.CudaContext.LoadKernelFatBin(image, "Run");
                return status;
            }
        }

示例#2

0

显示文件

文件： TensorOpGpu.KernelLoader.cs 项目： uzbekdev1/Neuro

            /// <summary>
            /// Compiles <paramref name="kernelSourceFile"/> with NVRTC, writes the compiler log next to the
            /// source as "&lt;name&gt;.ptx.log" and, on success, loads the "Run" kernel and writes the PTX
            /// next to the source as "&lt;name&gt;.ptx".
            /// </summary>
            /// <param name="kernelSourceFile">Path of the CUDA source file to compile.</param>
            /// <param name="kernel">Receives the loaded kernel; stays <c>null</c> when compilation failed.</param>
            /// <returns>
            /// <c>nvrtcResult.Success</c>, or the error code carried by the caught <c>NVRTCException</c>.
            /// </returns>
            private nvrtcResult LoadKernel(string kernelSourceFile, out CudaKernel kernel)
            {
                kernel = null;

                using (var rtc = new CudaRuntimeCompiler(File.ReadAllText(kernelSourceFile), Path.GetFileName(kernelSourceFile)))
                {
                    var status = nvrtcResult.Success;

                    try
                    {
                        rtc.Compile(new string[0]);
                    }
                    catch (NVRTCException compileError)
                    {
                        status = compileError.NVRTCError;
                    }

                    // Output files share the source file's directory and base name.
                    var sourceDirectory = Path.GetDirectoryName(kernelSourceFile);
                    var baseName        = Path.GetFileNameWithoutExtension(kernelSourceFile);
                    var outputStem      = Path.Combine(sourceDirectory, baseName);

                    // The log is written whether or not compilation succeeded.
                    File.WriteAllText(outputStem + ".ptx.log", rtc.GetLogAsString());

                    if (status != nvrtcResult.Success)
                    {
                        return status;
                    }

                    var image = rtc.GetPTX();
                    kernel = _CudaContext.LoadKernelFatBin(image, "Run");
                    File.WriteAllBytes(outputStem + ".ptx", image);
                    return status;
                }
            }

示例#3

0

显示文件

文件： CudaProcessor.cs 项目： Lordsklilen/SimpleCUDA

        /// <summary>
        /// Compiles the CUDA source file referenced by the <c>path</c> member with NVRTC and, on success,
        /// loads the entry point named by <c>methodName</c> into the <c>multiply</c> member.
        /// </summary>
        /// <param name="log">Receives the compiler log, both on success and on failure.</param>
        /// <returns>
        /// <c>nvrtcResult.Success</c>, or the error code carried by the caught <c>NVRTCException</c>.
        /// </returns>
        internal nvrtcResult LoadKernel(out string log)
        {
            using (var compiler = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path)))
            {
                var status = nvrtcResult.Success;

                try
                {
                    compiler.Compile(Array.Empty<string>());
                }
                catch (NVRTCException compileError)
                {
                    status = compileError.NVRTCError;
                }

                log = compiler.GetLogAsString();

                // On failure the previously loaded kernel (if any) is left untouched.
                if (status != nvrtcResult.Success)
                {
                    return status;
                }

                byte[] image = compiler.GetPTX();
                multiply = ctx.LoadKernelFatBin(image, methodName);
                return status;
            }
        }

示例#4

0

显示文件

        /// <summary>
        /// Compiles the CUDA source file referenced by the <c>path</c> member with NVRTC, then creates a
        /// CUDA context on device 0 and loads the entry point named by <c>methodName</c> into
        /// <c>kernel</c>.
        /// </summary>
        /// <remarks>
        /// Reading the file or compiling it may throw; such exceptions propagate to the caller. The
        /// context is created only after compilation succeeded, so a failed compile no longer leaves an
        /// undisposed context behind in <c>ctx</c>.
        /// </remarks>
        internal void LoadKernel()
        {
            using var rtc = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path));

            // Compile throws on failure, so reaching the next line already implies success;
            // no separate result flag is needed.
            rtc.Compile(Array.Empty<string>());
            byte[] ptx = rtc.GetPTX();

            ctx    = new CudaContext(0, true);
            kernel = ctx.LoadKernelFatBin(ptx, methodName);
        }

示例#5

0

显示文件

        /// <summary>
        /// Reads "./assets/CUDA_kernels/&lt;kernelName&gt;.cu", compiles it with NVRTC and returns the
        /// generated PTX image. The compiler log is written to the console.
        /// </summary>
        /// <param name="kernelName">Base name of the .cu file; also used as the NVRTC program name.</param>
        /// <returns>The PTX produced by the compiler.</returns>
        /// <remarks>
        /// A failed compilation still throws <c>NVRTCException</c> as before, but the compiler log is now
        /// printed first, and the compiler is disposed on every path.
        /// </remarks>
        private static byte[] prepare_kernel(string kernelName)
        {
            string fileToCompile = File.ReadAllText("./assets/CUDA_kernels/" + kernelName + ".cu");

            // The using block releases the native NVRTC program even when Compile or GetPTX throws.
            using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, kernelName))
            {
                try
                {
                    rtc.Compile(new string[] {});
                }
                catch (NVRTCException)
                {
                    // The log is the only place the compiler diagnostics show up; print it before propagating.
                    Console.WriteLine(rtc.GetLogAsString());
                    throw;
                }

                Console.WriteLine(rtc.GetLogAsString());

                return rtc.GetPTX();
            }
        }

示例#6

0

显示文件

    /// <summary>
    /// Creates a CUDA context on device 0 and, for every entry in <c>cudafiles</c>, compiles
    /// "&lt;Application.dataPath&gt;/Scripts/CUDA/&lt;name&gt;.cu" with NVRTC and loads the kernel whose
    /// entry point has the same name as the file into <c>cudaKernel</c>.
    /// </summary>
    /// <remarks>
    /// The source text and the compiler log of every file are written to the debug log. Exceptions
    /// from file access, compilation or module loading propagate to the caller.
    /// </remarks>
    protected void InitializeCUDA()
    {
        cudaKernel = new CudaKernel[cudafiles.Length];
        ctx        = new CudaContext(0);

        for (int i = 0; i < cudafiles.Length; ++i)
        {
            // Path.Combine picks the platform's separator instead of hard-coding Windows backslashes.
            string sourcePath = Path.Combine(Application.dataPath, "Scripts", "CUDA", cudafiles[i] + ".cu");
            string sourceText = File.ReadAllText(sourcePath);
            Debug.Log(sourceText);

            byte[] ptx;
            // The using block releases the native NVRTC program even when compilation throws.
            using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(sourceText, cudafiles[i]))
            {
                rtc.Compile(CompileOption);
                Debug.Log(rtc.GetLogAsString());

                ptx = rtc.GetPTX();
            }

            cudaKernel[i] = ctx.LoadKernelPTX(ptx, cudafiles[i]);
        }
    }

示例#7

0

显示文件

文件： BlurPerformanceTests.cs 项目： strict-lang/Strict

        /// <summary>
        /// Compiles the embedded "blur" CUDA kernel for the compute capability of device 0, loads it
        /// into <c>kernel</c>, configures a one-dimensional launch of 512-thread blocks covering
        /// <c>Size</c> elements, and sets up the <c>input</c> and <c>output</c> buffers.
        /// </summary>
        /// <exception cref="Exception">
        /// Thrown when NVRTC reports a compile error. The message holds the NVRTC error code and the
        /// original exception text; the original <c>NVRTCException</c> is attached as the inner exception.
        /// </exception>
        public void CompileKernel()
        {
            //generate as output language obviously from strict code
            var code =
                @"extern ""C"" __global__ void blur(unsigned char* image, unsigned char* output, size_t width, size_t height)
{
  size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid > width && tid < width*height-width) {
    output[tid] = image[tid];// (image[tid-2048]+image[tid-1]+image[tid]+image[tid+1]+image[tid+2048])/5;
  }
}";

            using var rtc = new CudaRuntimeCompiler(code, "blur");
            try
            {
                // Use max capabilities on actual hardware we have at runtime
                var computeVersion     = CudaContext.GetDeviceComputeCapability(0);
                var shaderModelVersion = "" + computeVersion.Major + computeVersion.Minor;
                Console.WriteLine("ShaderModelVersion=" + shaderModelVersion);
                // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options
                //https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/
                //nvcc .\vectorAdd.cu -use_fast_math -ptx -m 64 -arch compute_61 -code sm_61 -o .\vectorAdd.ptx
                //https://docs.nvidia.com/cuda/nvrtc/index.html#group__options
                rtc.Compile(new[] { "--gpu-architecture=compute_" + shaderModelVersion });
                Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());
                var deviceID = 0;
                // The context is intentionally not disposed here: the kernel stored in the field keeps using it.
                var ctx      = new CudaContext(deviceID);
                kernel = ctx.LoadKernelPTX(rtc.GetPTX(), "blur");
                // Round up so a partially filled last block still covers the tail of the data.
                kernel.GridDimensions  = (Size + 511) / 512;
                kernel.BlockDimensions = 512;
                input  = image;
                output = new CudaDeviceVariable<byte>(Size);
            }
            catch (NVRTCException ex)
            {
                Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString());
                // Same message as before, but the original exception is kept as InnerException
                // so its stack trace and type are not lost.
                throw new Exception(ex.NVRTCError + " " + ex, ex);
            }
        }

示例#8

0

显示文件

文件： CUDAVectorPrimitivesProvider.cs 项目： hanshaoqiang/vector-accelerator

        /// <summary>
        /// Demonstration routine: compiles an inline "saxpy" CUDA kernel with NVRTC (debug option
        /// "-G"), loads it into a temporary context, configures its launch dimensions, and then shows
        /// how several kernels can be created from one module file.
        /// </summary>
        /// <remarks>
        /// Nothing is launched and nothing is returned; the context, the compiler and the scratch device
        /// buffer are all released before the method returns.
        /// </remarks>
        public void Compile()
        {
            using (var ctx = new CudaContext())
            {
                // with verbatim string @, we only have to double up double quotes: no other escaping
                string source = @"
                extern ""C"" __global__ 
                void saxpy(float a, float *x, float *y, float *out, size_t n)
                { 
	                size_t tid = blockIdx.x * blockDim.x + threadIdx.x; 
	                if (tid < n) 
	                { 
		                out[tid] = a * x[tid] + y[tid]; 
	                } 
                }
                ";

                source += Environment.NewLine;

                var name         = "Test";
                var headers      = new string[0];
                var includeNames = new string[0];

                byte[] ptx;
                // The using block releases the native NVRTC program even when compilation throws.
                using (var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames))
                {
                    // Other options worth trying: --ptxas-options=-v -keep
                    compiler.Compile(new string[] { "-G" });

                    // compiler.GetPTXAsString() returns the PTX as text, which is handy for debugging.
                    ptx = compiler.GetPTX();
                }

                // The entry point must match the extern "C" function name in the source above;
                // looking up any other name fails because the module does not contain it.
                var kernel = ctx.LoadKernelPTX(ptx, "saxpy");

                //One kernel per cu file:
                //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
                kernel.GridDimensions  = new dim3(1, 1, 1);
                kernel.BlockDimensions = new dim3(16, 16);

                // Scratch device buffer; the using block frees it deterministically.
                using (var a = new CudaDeviceVariable<double>(100))
                {
                    //Multiple kernels per cu file:
                    // NOTE(review): "path\to\kernel.ptx" looks like a placeholder path - replace it with a
                    // real module file before running this part.
                    CUmodule   cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
                    CudaKernel kernel1  = new CudaKernel("kernel1", cumodule, ctx)
                    {
                        GridDimensions  = new dim3(1, 1, 1),
                        BlockDimensions = new dim3(16, 16),
                    };
                    CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
                    {
                        GridDimensions  = new dim3(1, 1, 1),
                        BlockDimensions = new dim3(16, 16),
                    };
                }
            }
        }

示例#9

0

显示文件

        /// <summary>
        /// Vector-addition sample: compiles "vectorAdd_kernel.cu" with NVRTC, runs the "vectorAdd"
        /// kernel on device 0 over 50000 random floats and verifies the result on the host.
        /// </summary>
        /// <param name="args">Command-line arguments, passed to NVRTC unchanged as compiler options.</param>
        /// <remarks>
        /// The compiler, the context and all device buffers are released on every exit path, including
        /// the early return taken when verification fails.
        /// </remarks>
        static void Main(string[] args)
        {
            string filename      = "vectorAdd_kernel.cu"; //we assume the file is in the same folder...
            string fileToCompile = File.ReadAllText(filename);

            byte[] ptx;
            // The using block releases the native NVRTC program even when compilation throws.
            using (CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"))
            {
                rtc.Compile(args);

                string log = rtc.GetLogAsString();

                Console.WriteLine(log);

                ptx = rtc.GetPTX();
            }

            using (CudaContext ctx = new CudaContext(0))
            {
                CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd");

                // Print the vector length to be used
                int numElements = 50000;

                Console.WriteLine("[Vector addition of {0} elements]", numElements);

                // Allocate the host input vector A
                float[] h_A = new float[numElements];
                // Allocate the host input vector B
                float[] h_B = new float[numElements];
                // Allocate the host output vector C
                float[] h_C = new float[numElements];

                // Fixed seed so every run uses the same input data
                Random rand = new Random(0);

                // Initialize the host input vectors
                for (int i = 0; i < numElements; ++i)
                {
                    h_A[i] = (float)rand.NextDouble();
                    h_B[i] = (float)rand.NextDouble();
                }

                Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n");

                // d_A / d_B: allocate the device input vectors and copy the host data to the device.
                // d_C: allocate the device output vector.
                // The using blocks free the device memory even on the early return below.
                using (CudaDeviceVariable<float> d_A = h_A)
                using (CudaDeviceVariable<float> d_B = h_B)
                using (CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements))
                {
                    // Launch the Vector Add CUDA Kernel
                    int threadsPerBlock = 256;
                    // Round up so the last, partially filled block is included.
                    int blocksPerGrid   = (numElements + threadsPerBlock - 1) / threadsPerBlock;

                    Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock);
                    vectorAdd.BlockDimensions = new dim3(threadsPerBlock, 1, 1);
                    vectorAdd.GridDimensions  = new dim3(blocksPerGrid, 1, 1);

                    vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements);

                    // Copy the device result vector in device memory to the host result vector
                    // in host memory.
                    Console.WriteLine("Copy output data from the CUDA device to the host memory\n");
                    d_C.CopyToHost(h_C);

                    // Verify that the result vector is correct
                    for (int i = 0; i < numElements; ++i)
                    {
                        if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5)
                        {
                            Console.WriteLine("Result verification failed at element {0}!\n", i);
                            return;
                        }
                    }

                    Console.WriteLine("Test PASSED\n");
                }
            }

            Console.WriteLine("Done\n");
        }

C# (CSharp) CudaRuntimeCompiler.Compile示例