public void CUDA_AddFloatArrays()
{
    // Load the kernel PTX image (resName points to the PTX file/resource)
    Stream stream = new StreamReader(resName).BaseStream;
    if (stream == null)
    {
        throw new ArgumentException("Kernel not found in resources.");
    }
    vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

    var threadsPerBlock = 1024;
    vectorAddKernel.BlockDimensions = threadsPerBlock;
    vectorAddKernel.GridDimensions = (Count + threadsPerBlock - 1) / threadsPerBlock;

    CudaStopWatch w = new CudaStopWatch();
    w.Start();
    vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count);
    w.Stop();
    Debug.Log(w.GetElapsedTime() / 1000.0f); // GetElapsedTime() returns milliseconds, so this logs seconds

    Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}");
    Debug.Log($"{h_A[Count - 1]} + {h_B[Count - 1]} = {C[Count - 1]}");

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    // h_C = d_C;
}
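If the commented-out copy-back above is actually needed, a minimal sketch could look like the following; it assumes the same fields the snippet already uses (h_A, h_B, Count and the device variable C) and is not part of the original code.

// Sketch only (assumes the snippet's fields h_A, h_B, Count and device variable C):
float[] h_C = new float[Count];
C.CopyToHost(h_C); // device -> host copy of the result vector
for (int i = 0; i < Count; i++)
{
    if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5f)
    {
        Debug.Log($"Mismatch at element {i}");
        break;
    }
}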
private T[] RunKernel<T>(Action<T[]> method, T[] parameters) where T : struct
{
    var methodInfo = method.Method;
    string[] kernels;
    string llvmIr, ptxIr;
    var ptx = CudaSharp.CudaSharp.Translate(out kernels, out llvmIr, out ptxIr, "sm_20", methodInfo);
    Console.WriteLine(llvmIr);
    Console.WriteLine(ptxIr);
    var kernel = _context.LoadKernelPTX(ptx, kernels[0]);
    var maxThreads = kernel.MaxThreadsPerBlock;
    if (parameters.Length <= maxThreads)
    {
        kernel.BlockDimensions = parameters.Length;
        kernel.GridDimensions = 1;
    }
    else
    {
        kernel.BlockDimensions = maxThreads;
        kernel.GridDimensions = parameters.Length / maxThreads;
        if ((kernel.BlockDimensions * kernel.GridDimensions) != parameters.Length)
        {
            throw new Exception(string.Format("Invalid parameters size (must be <= {0} or a multiple of {0})", maxThreads));
        }
    }
    var gpuMem = new CudaDeviceVariable<T>(parameters.Length);
    gpuMem.CopyToDevice(parameters);
    kernel.Run(gpuMem.DevicePointer);
    gpuMem.CopyToHost(parameters);
    gpuMem.Dispose();
    return parameters;
}
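A common alternative to rejecting sizes that are not an exact multiple of the block size is to round the grid up and bounds-check inside the kernel; a minimal sketch under that assumption, using the same kernel and parameters as above:

// Sketch, not the author's code: ceil-divide so any parameters.Length is covered,
// provided the kernel itself guards against out-of-range thread indices.
kernel.BlockDimensions = maxThreads;
kernel.GridDimensions = (parameters.Length + maxThreads - 1) / maxThreads;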
public CudaKernel Get(CudaContext context, byte[] ptx, string kernelName)
{
    lock (locker)
    {
        try
        {
            if (activeKernels.TryGetValue(Tuple.Create(context, ptx, kernelName), out CudaKernel value))
            {
                return value;
            }
            else
            {
                value = context.LoadKernelPTX(ptx, kernelName);
                activeKernels.Add(Tuple.Create(context, ptx, kernelName), value);
                return value;
            }
        }
        catch (Exception err)
        {
            Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Exception: '{err.Message}'");
            Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Call stack: '{err.StackTrace}'");
            throw; // rethrow without resetting the stack trace
        }
    }
}
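Note that Tuple.Create(context, ptx, kernelName) compares the byte array by reference, so the cache only hits when the exact same PTX array instance is passed in again. A hedged sketch of a content-based key (an assumption about intent, not the original design):

// Sketch: derive a stable key from the PTX contents instead of the array reference.
string ptxHash;
using (var sha = System.Security.Cryptography.SHA256.Create())
{
    ptxHash = Convert.ToBase64String(sha.ComputeHash(ptx));
}
var key = Tuple.Create(context, ptxHash, kernelName); // would require a matching dictionary key type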
static CudaKernel BuildKernelFromFunction(string functionName, ref CudaContext context)
{
    //CudaContext newContext = new CudaContext();
    CudaKernel kernel = context.LoadKernelPTX(PTX_NAME, functionName);
    kernel.BlockDimensions = THREADS_PER_BLOCK;
    kernel.GridDimensions = BLOCKS_PER_GRID;
    return kernel;
}
//static float3[] h_A;
//static float3[] h_C;
//static CudaDeviceVariable<float3> d_A;
//static CudaDeviceVariable<float3> d_C;

public CalculateHeatmap()
{
    ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());
    dev = ctx.GetDeviceInfo();
    Console.WriteLine("Using CUDA Device {0} compute level {1} timeout {2}", dev.DeviceName, dev.ComputeCapability, dev.KernelExecTimeoutEnabled ? "enabled" : "disabled");

    string resName = @"C:\WEDEV\GpuImplementations\GpuInterpolation\RasterInterpolation_x64.ptx";
    Console.WriteLine("Loading Interpolation Kernel");
    InterpolateKernel = ctx.LoadKernelPTX(resName, "RasterInterpolate");
}
public GrabCutUtils() { ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId(), false); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) { resName = "GrabCutUtils_x64.ptx"; } else { resName = "GrabCutUtils.ptx"; } string resNamespace = "GrabCutNPP"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } byte[] kernel = new byte[stream.Length]; int bytesToRead = (int)stream.Length; while (bytesToRead > 0) { bytesToRead -= stream.Read(kernel, (int)stream.Position, bytesToRead); } TrimapFromRectKernel = ctx.LoadKernelPTX(kernel, "_Z20TrimapFromRectKernelPhi8NppiRectii"); ApplyMatteKernelMode0 = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi0EEvP6uchar4iPKS0_iPKhiii"); ApplyMatteKernelMode1 = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi1EEvP6uchar4iPKS0_iPKhiii"); ApplyMatteKernelMode2 = ctx.LoadKernelPTX(kernel, "_Z16ApplyMatteKernelILi2EEvP6uchar4iPKS0_iPKhiii"); convertRGBToRGBAKernel = ctx.LoadKernelPTX(kernel, "_Z22convertRGBToRGBAKernelP6uchar4iP6uchar3iii"); }
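The manual read loop above works because it advances the offset with stream.Position; an equivalent and arguably simpler read into a byte array (a sketch, not the original author's code) is:

// Sketch: copy the embedded resource stream into a byte[] via MemoryStream.
byte[] kernelImage;
using (var ms = new MemoryStream())
{
    stream.CopyTo(ms);
    kernelImage = ms.ToArray();
}
// kernelImage can then be passed to ctx.LoadKernelPTX(kernelImage, "...") just like 'kernel' above.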
/// <summary>
/// Gets the cached kernel for the given context, PTX and name, loading and caching it on first use.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="ptx">The PTX.</param>
/// <param name="kernelName">Name of the kernel.</param>
/// <returns>CudaKernel.</returns>
public CudaKernel Get(CudaContext context, byte[] ptx, string kernelName)
{
    CudaKernel value;
    if (activeKernels.TryGetValue(Tuple.Create(context, ptx, kernelName), out value))
    {
        return value;
    }
    else
    {
        value = context.LoadKernelPTX(ptx, kernelName);
        activeKernels.Add(Tuple.Create(context, ptx, kernelName), value);
        return value;
    }
}
unsafe static public Int64 GPUSobel(Bitmap image, byte[] grayData) { int width = image.Width; int height = image.Height; var context = new CudaContext(); dim3 blockDim = new dim3(16, 16); uint gridX = (uint)(width + blockDim.x - 1) / blockDim.x; uint gridY = (uint)(height + blockDim.y - 1) / blockDim.y; dim3 gridDim = new dim3(gridX, gridY); CudaKernel kernel = context.LoadKernelPTX("Kernel.ptx", "Sobel"); kernel.BlockDimensions = blockDim; kernel.GridDimensions = gridDim; BitmapData imageData = image.LockBits(new Rectangle(0, 0, width, height), ImageLockMode.ReadWrite, image.PixelFormat); uint* ptr = (uint*)imageData.Scan0.ToPointer(); // An unsigned int pointer. This points to the image data in memory, each uint is one pixel ARGB int stride = imageData.Stride / 4; // Stride is the width of one pixel row, including any padding. In bytes, /4 converts to 4 byte pixels CudaDeviceVariable<byte> deviceGrayData = grayData; CudaDeviceVariable<uint> output = new CudaDeviceVariable<uint>(width * height); Stopwatch sw = Stopwatch.StartNew(); kernel.Run(deviceGrayData.DevicePointer, output.DevicePointer, width, height); sw.Stop(); Int64 ticks = sw.ElapsedTicks; uint[] filteredImage = output; int index = 0; for (int i = 1; i < height; ++i) { for (int j = 1; j < width; ++j) { *(ptr + i * stride + j) = filteredImage[index++]; } } for (int x = 0; x < width; ++x) { *(ptr + (height - 1) * stride + x) = 0; *(ptr + x) = 0; } for (int y = 0; y < height; ++y) { *(ptr + y * stride) = 0; *(ptr + y * stride + width - 1) = 0; } // Finish with image and save image.UnlockBits(imageData); return ticks; }
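The snippet leaves the device allocations and the context alive after returning; a minimal cleanup sketch (assumed, not shown in the original) just before the return would be:

// Sketch: release device memory and the context once the result has been copied back.
output.Dispose();
deviceGrayData.Dispose();
context.Dispose();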
static void Test(byte[] ptxFile) { const int size = 16; var context = new CudaContext(); var kernel = context.LoadKernelPTX(ptxFile, "kernel"); var memory = context.AllocateMemory(4 * size); var gpuMemory = new CudaDeviceVariable<int>(memory); var cpuMemory = new int[size]; for (var i = 0; i < size; i++) cpuMemory[i] = i - 2; gpuMemory.CopyToDevice(cpuMemory); kernel.BlockDimensions = 4; kernel.GridDimensions = 4; kernel.Run(memory); gpuMemory.CopyToHost(cpuMemory); for (var i = 0; i < size; i++) Console.WriteLine("{0} = {1}", i, cpuMemory[i]); }
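One possible way to obtain the ptxFile bytes passed to Test is runtime compilation with NVRTC, as other snippets in this collection do; the file name kernel.cu below is purely an assumption:

// Sketch: compile a .cu source at runtime and hand the resulting PTX to Test().
string source = File.ReadAllText("kernel.cu"); // hypothetical file name
using (var rtc = new CudaRuntimeCompiler(source, "kernel"))
{
    rtc.Compile(new string[] { });
    Test(rtc.GetPTX());
}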
protected void InitializeCUDA() { string[] filetext = new string[cudafiles.Length]; cudaKernel = new CudaKernel[cudafiles.Length]; ctx = new CudaContext(0); for (int i = 0; i < cudafiles.Length; ++i) { filetext[i] = File.ReadAllText(Application.dataPath + @"\Scripts\CUDA\" + cudafiles[i] + ".cu"); Debug.Log(filetext[i]); CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(filetext[i], cudafiles[i]); rtc.Compile(CompileOption); Debug.Log(rtc.GetLogAsString()); byte[] ptx = rtc.GetPTX(); rtc.Dispose(); cudaKernel[i] = ctx.LoadKernelPTX(ptx, cudafiles[i]); } }
public void CompileKernel() { //generate as output language obviously from strict code var code = @"extern ""C"" __global__ void blur(unsigned char* image, unsigned char* output, size_t width, size_t height) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid > width && tid < width*height-width) { output[tid] = image[tid];// (image[tid-2048]+image[tid-1]+image[tid]+image[tid+1]+image[tid+2048])/5; } }"; using var rtc = new CudaRuntimeCompiler(code, "blur"); try { // Use max capabilities on actual hardware we have at runtime var computeVersion = CudaContext.GetDeviceComputeCapability(0); var shaderModelVersion = "" + computeVersion.Major + computeVersion.Minor; Console.WriteLine("ShaderModelVersion=" + shaderModelVersion); // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options //https://arnon.dk/matching-sm-architectures-arch-and-gencode-for-various-nvidia-cards/ //nvcc .\vectorAdd.cu -use_fast_math -ptx -m 64 -arch compute_61 -code sm_61 -o .\vectorAdd.ptx //https://docs.nvidia.com/cuda/nvrtc/index.html#group__options rtc.Compile(new[] { "--gpu-architecture=compute_" + shaderModelVersion }); Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString()); var deviceID = 0; var ctx = new CudaContext(deviceID); kernel = ctx.LoadKernelPTX(rtc.GetPTX(), "blur"); kernel.GridDimensions = (Size + 511) / 512; kernel.BlockDimensions = 512; //unused: float[] copyInput = new float[Size]; input = image; output = new CudaDeviceVariable <byte>(Size); } catch (NVRTCException ex) { Console.WriteLine("Cuda compile log: " + rtc.GetLogAsString()); throw new Exception(ex.NVRTCError + " " + ex); } }
protected T[] InternalExecuteCuda <T>( byte[] kernelBinary, String function, int bufferSize, ParallelTaskParams loaderParams, params Object[] kernelParams) where T : struct { TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointStart); CudaContext context = ContextWithDevice(loaderParams.CudaDevice); TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformInit); TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelBuild); CudaDeviceVariable <T> resultBufferVar = new CudaDeviceVariable <T>(bufferSize); resultBufferVar.Memset(0); List <Tuple <Object, IDisposable> > vars = new List <Tuple <Object, IDisposable> >(); vars.Add(new Tuple <Object, IDisposable>(resultBufferVar.DevicePointer, resultBufferVar)); vars.AddRange(WrapDeviceVariables(kernelParams, true)); TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceWrite); CudaKernel kernel = context.LoadKernelPTX(kernelBinary, function); kernel.BlockDimensions = new dim3(loaderParams.BlockSize.Width, loaderParams.BlockSize.Height); kernel.GridDimensions = new dim3(loaderParams.GridSize.Width, loaderParams.GridSize.Height); kernel.Run(vars.Select(tuple => tuple.Item1).ToArray()); TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointKernelExecute); T[] resultBuffer = resultBufferVar; TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointDeviceRead); vars.Where(tuple => tuple.Item2 != null).ToList().ForEach(tuple => tuple.Item2.Dispose()); TriggerCheckpoint(ParallelExecutionCheckpointType.CheckpointPlatformDeinit); return(resultBuffer); }
public BarycentricCuda(int width, int height)
{
    Width = width;
    Height = height;

    //Init Cuda context
    ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId());

    //Load Kernel image from resources
    string resName = "bary.ptx";
    string resNamespace = "BarycentricCudaLib";
    string resource = resNamespace + "." + resName;
    Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
    if (stream == null)
    {
        throw new ArgumentException("Kernel not found in resources.");
    }

    baryKernel = ctx.LoadKernelPTX(stream, "baryKernel");
}
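If GetManifestResourceStream returns null, listing the assembly's manifest resource names usually reveals the exact resource string to use; a small debugging sketch (an assumption, not part of the original constructor):

// Sketch: print every embedded resource name to find the correct "Namespace.file.ptx" string.
foreach (var n in Assembly.GetExecutingAssembly().GetManifestResourceNames())
{
    Console.WriteLine(n);
}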
static void Main(string[] args) { var assembly = Assembly.GetExecutingAssembly(); var resourceName = "simpleOccupancy.simpleOccupancy.ptx"; ctx = new CudaContext(0); string[] liste = assembly.GetManifestResourceNames(); using (Stream stream = assembly.GetManifestResourceStream(resourceName)) { kernel = ctx.LoadKernelPTX(stream, "square"); } Console.WriteLine("starting Simple Occupancy"); Console.WriteLine(); Console.WriteLine("[ Manual configuration with {0} threads per block ]", manualBlockSize); int status = test(false); if (status != 0) { Console.WriteLine("Test failed"); return; } Console.WriteLine(); Console.WriteLine("[ Automatic, occupancy-based configuration ]"); status = test(true); if (status != 0) { Console.WriteLine("Test failed"); return; } Console.WriteLine(); Console.WriteLine("Test PASSED"); }
static void Main(string[] args) { int N = 50000; int deviceID = 0; ManagedCuda.CudaContext ctx = new CudaContext(deviceID); string ptx = @"// // Generated by NVIDIA NVVM Compiler // // Compiler Build ID: CL-21112126 // Cuda compilation tools, release 8.0, V8.0.43 // Based on LLVM 3.4svn // .version 5.0 .target sm_20, debug .address_size 64 // .globl VecAdd .visible .entry VecAdd( .param .u64 VecAdd_param_0, .param .u64 VecAdd_param_1, .param .u64 VecAdd_param_2, .param .u32 VecAdd_param_3 ) { .reg .pred %p<3>; .reg .f32 %f<4>; .reg .b32 %r<7>; .reg .b64 %rd<13>; .loc 1 27 1 func_begin0: .loc 1 0 0 .loc 1 27 1 ld.param.u64 %rd1, [VecAdd_param_0]; ld.param.u64 %rd2, [VecAdd_param_1]; ld.param.u64 %rd3, [VecAdd_param_2]; ld.param.u32 %r2, [VecAdd_param_3]; func_exec_begin0: .loc 1 29 1 tmp0: mov.u32 %r3, %ntid.x; mov.u32 %r4, %ctaid.x; mul.lo.s32 %r5, %r3, %r4; mov.u32 %r6, %tid.x; add.s32 %r1, %r5, %r6; tmp1: .loc 1 30 1 setp.lt.s32 %p1, %r1, %r2; not.pred %p2, %p1; @%p2 bra BB0_2; bra.uni BB0_1; BB0_1: .loc 1 31 1 tmp2: cvt.s64.s32 %rd4, %r1; shl.b64 %rd5, %rd4, 2; add.s64 %rd6, %rd1, %rd5; ld.f32 %f1, [%rd6]; cvt.s64.s32 %rd7, %r1; shl.b64 %rd8, %rd7, 2; add.s64 %rd9, %rd2, %rd8; ld.f32 %f2, [%rd9]; add.f32 %f3, %f1, %f2; cvt.s64.s32 %rd10, %r1; shl.b64 %rd11, %rd10, 2; add.s64 %rd12, %rd3, %rd11; st.f32 [%rd12], %f3; tmp3: BB0_2: .loc 1 32 2 ret; tmp4: func_end0: } .file 1 ""I:/ManagedCuda/managedCuda/Samples/ManagedCudaSamples/vectorAddKernel/vectorAdd.cu"", 1477220395, 691 .section .debug_info { .b32 464 .b8 2 .b8 0 .b32 .debug_abbrev .b8 8 .b8 1 .b8 108 .b8 103 .b8 101 .b8 110 .b8 102 .b8 101 .b8 58 .b8 32 .b8 69 .b8 68 .b8 71 .b8 32 .b8 52 .b8 46 .b8 49 .b8 48 .b8 0 .b8 4 .b8 73 .b8 58 .b8 47 .b8 77 .b8 97 .b8 110 .b8 97 .b8 103 .b8 101 .b8 100 .b8 67 .b8 117 .b8 100 .b8 97 .b8 47 .b8 109 .b8 97 .b8 110 .b8 97 .b8 103 .b8 101 .b8 100 .b8 67 .b8 117 .b8 100 .b8 97 .b8 47 .b8 83 .b8 97 .b8 109 .b8 112 .b8 108 .b8 101 .b8 115 .b8 47 .b8 77 .b8 97 .b8 110 .b8 97 .b8 103 .b8 101 .b8 100 .b8 67 .b8 117 .b8 100 .b8 97 .b8 83 .b8 97 .b8 109 .b8 112 .b8 108 .b8 101 .b8 115 .b8 47 .b8 118 .b8 101 .b8 99 .b8 116 .b8 111 .b8 114 .b8 65 .b8 100 .b8 100 .b8 75 .b8 101 .b8 114 .b8 110 .b8 101 .b8 108 .b8 47 .b8 118 .b8 101 .b8 99 .b8 116 .b8 111 .b8 114 .b8 65 .b8 100 .b8 100 .b8 46 .b8 99 .b8 117 .b8 0 .b64 0 .b32 .debug_line .b8 73 .b8 58 .b8 92 .b8 77 .b8 97 .b8 110 .b8 97 .b8 103 .b8 101 .b8 100 .b8 67 .b8 117 .b8 100 .b8 97 .b8 92 .b8 109 .b8 97 .b8 110 .b8 97 .b8 103 .b8 101 .b8 100 .b8 67 .b8 117 .b8 100 .b8 97 .b8 92 .b8 83 .b8 97 .b8 109 .b8 112 .b8 108 .b8 101 .b8 115 .b8 92 .b8 77 .b8 97 .b8 110 .b8 97 .b8 103 .b8 101 .b8 100 .b8 67 .b8 117 .b8 100 .b8 97 .b8 83 .b8 97 .b8 109 .b8 112 .b8 108 .b8 101 .b8 115 .b8 92 .b8 118 .b8 101 .b8 99 .b8 116 .b8 111 .b8 114 .b8 65 .b8 100 .b8 100 .b8 75 .b8 101 .b8 114 .b8 110 .b8 101 .b8 108 .b8 0 .b8 2 .b8 86 .b8 101 .b8 99 .b8 65 .b8 100 .b8 100 .b8 0 .b8 86 .b8 101 .b8 99 .b8 65 .b8 100 .b8 100 .b8 0 .b32 1 .b32 27 .b32 422 .b8 1 .b64 func_begin0 .b64 func_end0 .b8 1 .b8 156 .b8 3 .b8 65 .b8 0 .b32 1 .b32 27 .b32 428 .b8 9 .b8 3 .b64 VecAdd_param_0 .b8 7 .b8 3 .b8 66 .b8 0 .b32 1 .b32 27 .b32 428 .b8 9 .b8 3 .b64 VecAdd_param_1 .b8 7 .b8 3 .b8 67 .b8 0 .b32 1 .b32 27 .b32 451 .b8 9 .b8 3 .b64 VecAdd_param_2 .b8 7 .b8 3 .b8 78 .b8 0 .b32 1 .b32 27 .b32 457 .b8 9 .b8 3 .b64 VecAdd_param_3 .b8 7 .b8 4 .b64 tmp0 .b64 tmp4 .b8 4 .b64 tmp0 .b64 tmp3 .b8 4 .b64 tmp0 .b64 tmp3 .b8 5 .b8 105 .b8 0 .b32 1 .b32 29 .b32 457 .b8 5 .b8 144 .b8 177 .b8 228 .b8 
149 .b8 1 .b8 2 .b8 0 .b8 0 .b8 0 .b8 0 .b8 6 .b8 118 .b8 111 .b8 105 .b8 100 .b8 0 .b8 7 .b32 434 .b8 12 .b8 8 .b32 439 .b8 9 .b8 102 .b8 108 .b8 111 .b8 97 .b8 116 .b8 0 .b8 4 .b32 4 .b8 7 .b32 439 .b8 12 .b8 9 .b8 105 .b8 110 .b8 116 .b8 0 .b8 5 .b32 4 .b8 0 } .section .debug_abbrev { .b8 1 .b8 17 .b8 1 .b8 37 .b8 8 .b8 19 .b8 11 .b8 3 .b8 8 .b8 17 .b8 1 .b8 16 .b8 6 .b8 27 .b8 8 .b8 0 .b8 0 .b8 2 .b8 46 .b8 1 .b8 135 .b8 64 .b8 8 .b8 3 .b8 8 .b8 58 .b8 6 .b8 59 .b8 6 .b8 73 .b8 19 .b8 63 .b8 12 .b8 17 .b8 1 .b8 18 .b8 1 .b8 64 .b8 10 .b8 0 .b8 0 .b8 3 .b8 5 .b8 0 .b8 3 .b8 8 .b8 58 .b8 6 .b8 59 .b8 6 .b8 73 .b8 19 .b8 2 .b8 10 .b8 51 .b8 11 .b8 0 .b8 0 .b8 4 .b8 11 .b8 1 .b8 17 .b8 1 .b8 18 .b8 1 .b8 0 .b8 0 .b8 5 .b8 52 .b8 0 .b8 3 .b8 8 .b8 58 .b8 6 .b8 59 .b8 6 .b8 73 .b8 19 .b8 2 .b8 10 .b8 51 .b8 11 .b8 0 .b8 0 .b8 6 .b8 59 .b8 0 .b8 3 .b8 8 .b8 0 .b8 0 .b8 7 .b8 15 .b8 0 .b8 73 .b8 19 .b8 51 .b8 11 .b8 0 .b8 0 .b8 8 .b8 38 .b8 0 .b8 73 .b8 19 .b8 0 .b8 0 .b8 9 .b8 36 .b8 0 .b8 3 .b8 8 .b8 62 .b8 11 .b8 11 .b8 6 .b8 0 .b8 0 .b8 0 } .section .debug_ranges { } .section .debug_pubnames { .b32 25 .b8 2 .b8 0 .b32 .debug_info .b32 464 .b32 195 .b8 86 .b8 101 .b8 99 .b8 65 .b8 100 .b8 100 .b8 0 .b32 0 } "; System.IO.Stream moduleImage = new MemoryStream(Encoding.UTF8.GetBytes(ptx)); CudaKernel kernel = ctx.LoadKernelPTX(moduleImage, "VecAdd"); kernel.GridDimensions = (N + 255) / 256; kernel.BlockDimensions = 256; // Allocate input vectors h_A and h_B in host memory float[] h_A = new float[N]; float[] h_B = new float[N]; // TODO: Initialize input vectors h_A, h_B System.Random random = new System.Random(); for (int i = 0; i < N; ++i) { h_A[i] = (float)random.NextDouble(); } for (int i = 0; i < N; ++i) { h_B[i] = (float)random.NextDouble(); } // Allocate vectors in device memory and copy vectors from host memory to device memory CudaDeviceVariable <float> d_A = h_A; CudaDeviceVariable <float> d_B = h_B; CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(N); // Invoke kernel kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N); // Copy result from device memory to host memory // h_C contains the result in host memory float[] h_C = d_C; for (int i = 0; i < 4; ++i) { System.Console.WriteLine(h_C[i]); } }
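The inline-PTX vector-add sample above ends without freeing its device memory; a short cleanup sketch (assumed, mirroring what other samples in this collection do) would be:

// Sketch: dispose device vectors and the context when done.
d_A.Dispose();
d_B.Dispose();
d_C.Dispose();
ctx.Dispose();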
public void Compile()
{
    using (var ctx = new CudaContext())
    {
        // with a verbatim string @, we only have to double up double quotes: no other escaping
        string source = @"
extern ""C"" __global__
void saxpy(float a, float *x, float *y, float *out, size_t n)
{
    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
    {
        out[tid] = a * x[tid] + y[tid];
    }
}
";
        source += Environment.NewLine;

        var name = "Test";
        var headers = new string[0];
        var includeNames = new string[0];
        var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames);
        //var compiler2 = new CudaRuntimeCompiler(source, name, headers, includeNames);

        // --ptxas-options=-v -keep
        compiler.Compile(new string[] { "-G" });
        //var ptxString = compiler.GetPTXAsString(); // for debugging
        var ptx = compiler.GetPTX();
        //compiler2.Compile(new string[] { });

        // load the kernel by the name it has in the compiled source ("saxpy")
        var kernel = ctx.LoadKernelPTX(ptx, "saxpy");

        //One kernel per cu file:
        //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname");
        kernel.GridDimensions = new dim3(1, 1, 1);
        kernel.BlockDimensions = new dim3(16, 16);
        //kernel.Run()

        var a = new CudaDeviceVariable<double>(100);
        //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr()

        //Multiple kernels per cu file:
        CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx");
        CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
        CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx)
        {
            GridDimensions = new dim3(1, 1, 1),
            BlockDimensions = new dim3(16, 16),
        };
    }
}
static void Main(string[] args) { ShrQATest.shrQAStart(args); Console.WriteLine("Vector Addition"); int N = 50000; //Init Cuda context ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) { resName = "vectorAdd_x64.ptx"; } else { resName = "vectorAdd.ptx"; } string resNamespace = "vectorAdd"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd"); // Allocate input vectors h_A and h_B in host memory h_A = new float[N]; h_B = new float[N]; // Initialize input vectors RandomInit(h_A, N); RandomInit(h_B, N); // Allocate vectors in device memory and copy vectors from host memory to device memory // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation. d_A = h_A; d_B = h_B; d_C = new CudaDeviceVariable <float>(N); // Invoke kernel int threadsPerBlock = 256; vectorAddKernel.BlockDimensions = threadsPerBlock; vectorAddKernel.GridDimensions = (N + threadsPerBlock - 1) / threadsPerBlock; vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N); // Copy result from device memory to host memory // h_C contains the result in host memory h_C = d_C; // Verify result int i; for (i = 0; i < N; ++i) { float sum = h_A[i] + h_B[i]; if (Math.Abs(h_C[i] - sum) > 1e-5) { break; } } CleanupResources(); ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
static void Main(string[] args) { int SIGNAL_SIZE = 50; int FILTER_KERNEL_SIZE = 11; Console.WriteLine("[simpleCUFFT] is starting..."); var assembly = Assembly.GetExecutingAssembly(); var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx"; CudaContext ctx = new CudaContext(0); CudaKernel ComplexPointwiseMulAndScale; string[] liste = assembly.GetManifestResourceNames(); using (Stream stream = assembly.GetManifestResourceStream(resourceName)) { ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale"); } // Allocate host memory for the signal cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplaction in reference host code... Random rand = new Random(0); // Initialize the memory for the signal for (int i = 0; i < SIGNAL_SIZE; ++i) { h_signal[i].real = (float)rand.NextDouble(); h_signal[i].imag = 0; } // Allocate host memory for the filter cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE]; // Initialize the memory for the filter for (int i = 0; i < FILTER_KERNEL_SIZE; ++i) { h_filter_kernel[i].real = (float)rand.NextDouble(); h_filter_kernel[i].imag = 0; } // Pad signal and filter kernel cuFloatComplex[] h_padded_signal = null; cuFloatComplex[] h_padded_filter_kernel = null; int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE, h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE); int mem_size = (int)cuFloatComplex.SizeOf * new_size; // Allocate device memory for signal CudaDeviceVariable <cuFloatComplex> d_signal = new CudaDeviceVariable <cuFloatComplex>(new_size); // Copy host memory to device d_signal.CopyToDevice(h_padded_signal); // Allocate device memory for filter kernel CudaDeviceVariable <cuFloatComplex> d_filter_kernel = new CudaDeviceVariable <cuFloatComplex>(new_size); // Copy host memory to device d_filter_kernel.CopyToDevice(h_padded_filter_kernel); // CUFFT plan simple API CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1); // Transform signal and kernel Console.WriteLine("Transforming signal cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Forward); plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward); // Multiply the coefficients together and normalize the result Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>"); ComplexPointwiseMulAndScale.BlockDimensions = 256; ComplexPointwiseMulAndScale.GridDimensions = 32; ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size); // Transform signal back Console.WriteLine("Transforming signal back cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse); // Copy device memory to host cuFloatComplex[] h_convolved_signal = d_signal; // Allocate host memory for the convolution result cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE]; // Convolve on the host Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref); // check result bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f); //Destroy CUFFT context plan.Dispose(); // cleanup memory d_filter_kernel.Dispose(); d_signal.Dispose(); ctx.Dispose(); if (bTestResult) { Console.WriteLine("Test Passed"); } else { Console.WriteLine("Test Failed"); } }
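For reference, a host-side version of the pointwise complex multiply-and-scale that the kernel is assumed to perform (a sketch only; the sample's own CPU reference lives in its Convolve helper) could look like this:

// Sketch (assumption about the kernel's semantics): a[i] = a[i] * b[i] * scale, complex multiply.
static void ComplexPointwiseMulAndScaleHost(cuFloatComplex[] a, cuFloatComplex[] b, int size, float scale)
{
    for (int i = 0; i < size; i++)
    {
        float re = a[i].real * b[i].real - a[i].imag * b[i].imag;
        float im = a[i].real * b[i].imag + a[i].imag * b[i].real;
        a[i].real = re * scale;
        a[i].imag = im * scale;
    }
}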
public KernelTests() { ctx = new CudaContext(); // we must call this first int Len = 100000; var A = new SyncVariable <float3>(GenRandomVectors(Len)); var B = new SyncVariable <float3>(GenRandomVectors(Len)); var C = new SyncVariable <float3>(Len); var D = new SyncVariable <float>(Len); var Length = new SyncVariable <int>(new int[] { Len }); // instead of an int use an int array of length 1 var TrisCpu = new FEA.Mesher.TriangleSTL[Len]; var rng = new Random(); for (int i = 0; i < Len; i++) { TrisCpu[i].Vertex1.x = (float)rng.NextDouble(); TrisCpu[i].Vertex1.y = (float)rng.NextDouble(); TrisCpu[i].Vertex1.z = (float)rng.NextDouble(); TrisCpu[i].Vertex2.x = (float)rng.NextDouble(); TrisCpu[i].Vertex2.y = (float)rng.NextDouble(); TrisCpu[i].Vertex2.z = (float)rng.NextDouble(); TrisCpu[i].Vertex3.x = (float)rng.NextDouble(); TrisCpu[i].Vertex3.y = (float)rng.NextDouble(); TrisCpu[i].Vertex3.z = (float)rng.NextDouble(); } var Tris = new SyncVariable <FEA.Mesher.TriangleSTL>(TrisCpu); var PtxFile = "KernelUnitTests.ptx"; var CrossProdKernel = ctx.LoadKernelPTX(PtxFile, "TestCrossProduct"); var AddKernel = ctx.LoadKernelPTX(PtxFile, "TestAdd"); var SubKernel = ctx.LoadKernelPTX(PtxFile, "TestSubtract"); var DotKernel = ctx.LoadKernelPTX(PtxFile, "TestDotProduct"); var AreaKernel = ctx.LoadKernelPTX(PtxFile, "TestTriangleArea"); var IntersectionKernel = ctx.LoadKernelPTX(PtxFile, "TestPlaneIntersection"); var BlockDims = new dim3(512); var GridDims = new dim3(Len / 512 + 1); CrossProdKernel.BlockDimensions = BlockDims; CrossProdKernel.GridDimensions = GridDims; AddKernel.BlockDimensions = BlockDims; AddKernel.GridDimensions = GridDims; SubKernel.BlockDimensions = BlockDims; SubKernel.GridDimensions = GridDims; DotKernel.BlockDimensions = BlockDims; DotKernel.GridDimensions = GridDims; AreaKernel.BlockDimensions = BlockDims; AreaKernel.GridDimensions = GridDims; CrossProdKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), C.GPUPtr()); A.Sync(); B.Sync(); C.Sync(); float eps = 1e-7f; for (int i = 0; i < Len; i++) { var Ans = A.cpuArray[i].Cross(B.cpuArray[i]) - C.cpuArray[i]; if (Ans.Length >= eps) { throw new Exception("Test Failed"); } } AddKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), C.GPUPtr()); A.Sync(); B.Sync(); C.Sync(); for (int i = 0; i < Len; i++) { var Ans = A.cpuArray[i] + B.cpuArray[i] - C.cpuArray[i]; if (Ans.Length >= eps) { throw new Exception("Test Failed"); } } SubKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), C.GPUPtr()); A.Sync(); B.Sync(); C.Sync(); for (int i = 0; i < Len; i++) { var Ans = A.cpuArray[i] - B.cpuArray[i] - C.cpuArray[i]; if (Ans.Length >= eps) { throw new Exception("Test Failed"); } } DotKernel.Run(Len, A.GPUPtr(), B.GPUPtr(), D.GPUPtr()); A.Sync(); B.Sync(); D.Sync(); for (int i = 0; i < Len; i++) { float Ans = A.cpuArray[i].Dot(B.cpuArray[i]) - D.cpuArray[i]; if (Ans >= 3 * eps) { throw new Exception("Test Failed"); } } AreaKernel.Run(Len, Tris.GPUPtr(), D.GPUPtr()); Tris.Sync(); D.Sync(); for (int i = 0; i < Len; i++) { float ans = D.cpuArray[i]; } }
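The grid size Len / 512 + 1 above launches one extra block whenever Len is an exact multiple of 512; the usual ceiling division avoids that (a sketch, assuming the kernels bounds-check against Len as they appear to):

// Sketch: exact ceil-division grid size for a 512-thread block.
var GridDimsExact = new dim3((Len + 512 - 1) / 512);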
static void Main(string[] args) { const int nx = 2048; const int ny = 2048; // shifts applied to x and y data const int x_shift = 5; const int y_shift = 7; ShrQATest.shrQAStart(args); if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) { Console.Write("nx and ny must be multiples of TILE_DIM\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED); } // execution configuration parameters dim3 grid = new dim3(nx / TILE_DIM, ny / TILE_DIM, 1); dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1); // This will pick the best possible CUDA capable device int devID = findCudaDevice(args); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) { resName = "simplePitchLinearTexture_x64.ptx"; } else { resName = "simplePitchLinearTexture.ptx"; } string resNamespace = "simplePitchLinearTexture"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } byte[] kernels = new byte[stream.Length]; int bytesToRead = (int)stream.Length; while (bytesToRead > 0) { bytesToRead -= stream.Read(kernels, (int)stream.Position, bytesToRead); } CudaKernel PLKernel = ctx.LoadKernelPTX(kernels, "shiftPitchLinear"); CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray"); CudaStopWatch stopwatch = new CudaStopWatch(); // ---------------------------------- // Host allocation and initialization // ---------------------------------- float[] h_idata = new float[nx * ny]; float[] h_odata = new float[nx * ny]; float[] gold = new float[nx * ny]; for (int i = 0; i < nx * ny; ++i) { h_idata[i] = (float)i; } // ------------------------ // Device memory allocation // ------------------------ // Pitch linear input data CudaPitchedDeviceVariable <float> d_idataPL = new CudaPitchedDeviceVariable <float>(nx, ny); // Array input data CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One); // Pitch linear output data CudaPitchedDeviceVariable <float> d_odata = new CudaPitchedDeviceVariable <float>(nx, ny); // ------------------------ // copy host data to device // ------------------------ // Pitch linear d_idataPL.CopyToDevice(h_idata); // Array d_idataArray.CopyFromHostToThis <float>(h_idata); // ---------------------- // Bind texture to memory // ---------------------- // Pitch linear CudaTextureLinearPitched2D <float> texRefPL = new CudaTextureLinearPitched2D <float>(PLKernel, "texRefPL", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL); CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray); // --------------------- // reference calculation // --------------------- for (int j = 0; j < ny; j++) { int jshift = (j + y_shift) % ny; for (int i = 0; i < nx; i++) { int ishift = (i + x_shift) % nx; gold[j * nx + i] = h_idata[jshift * nx + ishift]; } } // ---------------- // shiftPitchLinear // ---------------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); PLKernel.BlockDimensions = threads; PLKernel.GridDimensions = grid; stopwatch.Start(); for (int i = 0; i < NUM_REPS; i++) { PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timePL = stopwatch.GetElapsedTime(); // check results 
d_odata.CopyToHost(h_odata); bool res = cutComparef(gold, h_odata); bool success = true; if (res == false) { Console.Write("*** shiftPitchLinear failed ***\n"); success = false; } // ---------- // shiftArray // ---------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); ArrayKernel.BlockDimensions = threads; ArrayKernel.GridDimensions = grid; stopwatch.Start(); for (int i = 0; i < NUM_REPS; i++) { ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timeArray = stopwatch.GetElapsedTime(); // check results d_odata.CopyToHost(h_odata); res = cutComparef(gold, h_odata); if (res == false) { Console.Write("*** shiftArray failed ***\n"); success = false; } float bandwidthPL = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timePL / NUM_REPS); float bandwidthArray = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timeArray / NUM_REPS); Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n", bandwidthPL, bandwidthArray); float fetchRatePL = nx * ny / 1e+6f / (timePL / (1000.0f * NUM_REPS)); float fetchRateArray = nx * ny / 1e+6f / (timeArray / (1000.0f * NUM_REPS)); Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n", fetchRatePL, fetchRateArray); // cleanup texRefPL.Dispose(); texRefArray.Dispose(); d_idataPL.Dispose(); d_idataArray.Dispose(); d_odata.Dispose(); stopwatch.Dispose(); ctx.Dispose(); ShrQATest.shrQAFinishExit(args, (success == true) ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
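cutComparef is a helper defined elsewhere in the sample; a hedged sketch of an equivalent element-wise comparison (an assumption, not the sample's actual implementation):

// Sketch: compare two float arrays element-wise within a small tolerance.
static bool CompareArraysApprox(float[] reference, float[] data, float eps)
{
    if (reference.Length != data.Length) return false;
    for (int i = 0; i < reference.Length; i++)
    {
        if (Math.Abs(reference[i] - data[i]) > eps) return false;
    }
    return true;
}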
public float3[] GetPointsGPU(int NumPoints) { int BlockSize = 512; if (NumPoints % BlockSize != 0) { throw new Exception("NumPoints must be divisible by " + BlockSize.ToString()); } int[] TriangleCounts = new int[GridCount + 1]; var Maxima = new float3[GridCount]; var Minima = new float3[GridCount]; TriangleCounts[0] = 0; for (int i = 0; i < GridCount; i++) { int LocalCount = TriangleCounts[i] + (int)Domains[i].TriangleCount; if (Domains[i].TriangleCount > BlockSize) { throw new Exception("STL File must have no more than " + BlockSize.ToString() + " Triangles"); } TriangleCounts[i + 1] = LocalCount; Minima[i] = STLReader.ToFloat3(Domains[i].Extrema.Min); Maxima[i] = STLReader.ToFloat3(Domains[i].Extrema.Max); } var Triangles = new TriangleSTL[TriangleCounts[GridCount]]; int id = 0; for (int i = 0; i < GridCount; i++) { for (int j = 0; j < TriangleCounts[i]; j++) { var LocalTri = Domains[i].Triangles[j]; Triangles[id] = new TriangleSTL(LocalTri); id++; } } var ctx = new CudaContext(1); var DeviceInfo = ctx.GetDeviceInfo(); var d_Triangles = new CudaDeviceVariable <TriangleSTL>(Triangles.Length); var d_TriangleCounts = new CudaDeviceVariable <int>(GridCount); var d_Minima = new CudaDeviceVariable <float3>(GridCount); var d_Maxima = new CudaDeviceVariable <float3>(GridCount); var d_Points = new CudaDeviceVariable <float3>(GridCount * NumPoints); var h_Points = new float3[GridCount * NumPoints]; var rng = new Random(0); // use a sequence that is repeatable over and over again for (int i = 0; i < GridCount * NumPoints; i++) { h_Points[i].x = (float)rng.NextDouble(); h_Points[i].y = (float)rng.NextDouble(); h_Points[i].z = (float)rng.NextDouble(); } int ctr = 0; for (int i = 0; i < GridCount; i++) { for (int j = 0; j < NumPoints; j++) { h_Points[ctr].x = Minima[i].x + h_Points[ctr].x * (Maxima[i].x - Minima[i].x); h_Points[ctr].y = Minima[i].y + h_Points[ctr].y * (Maxima[i].y - Minima[i].y); h_Points[ctr].z = Minima[i].z + h_Points[ctr].z * (Maxima[i].z - Minima[i].z); ctr++; } } d_Points = h_Points; d_Triangles = Triangles; d_TriangleCounts = TriangleCounts; d_Minima = Minima; d_Maxima = Maxima; // copy over to host // TODO generate grid on GPU instead of CPU var PointInPolygonKernel = ctx.LoadKernelPTX("PointInPolygon.ptx", "PointInPolygon"); var BlockDim = new dim3(BlockSize, 1, 1); var GridDim = new dim3(GridCount, 1, 1); PointInPolygonKernel.BlockDimensions = BlockDim; PointInPolygonKernel.GridDimensions = GridDim; PointInPolygonKernel.Run(GridCount, NumPoints, d_TriangleCounts.DevicePointer, d_Triangles.DevicePointer, d_Maxima.DevicePointer, d_Minima.DevicePointer, d_Points.DevicePointer); h_Points = d_Points; return(h_Points); // TODO Fix this to remove bad points }
static void Main(string[] args) { string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder... string fileToCompile = File.ReadAllText(filename); CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"); rtc.Compile(args); string log = rtc.GetLogAsString(); Console.WriteLine(log); byte[] ptx = rtc.GetPTX(); rtc.Dispose(); CudaContext ctx = new CudaContext(0); CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd"); // Print the vector length to be used, and compute its size int numElements = 50000; SizeT size = numElements * sizeof(float); Console.WriteLine("[Vector addition of {0} elements]", numElements); // Allocate the host input vector A float[] h_A = new float[numElements]; // Allocate the host input vector B float[] h_B = new float[numElements]; // Allocate the host output vector C float[] h_C = new float[numElements]; Random rand = new Random(0); // Initialize the host input vectors for (int i = 0; i < numElements; ++i) { h_A[i] = (float)rand.NextDouble(); h_B[i] = (float)rand.NextDouble(); } Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n"); // Allocate the device input vector A and copy to device CudaDeviceVariable <float> d_A = h_A; // Allocate the device input vector B and copy to device CudaDeviceVariable <float> d_B = h_B; // Allocate the device output vector C CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(numElements); // Launch the Vector Add CUDA Kernel int threadsPerBlock = 256; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock); vectorAdd.BlockDimensions = new dim3(threadsPerBlock, 1, 1); vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1); vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements); // Copy the device result vector in device memory to the host result vector // in host memory. Console.WriteLine("Copy output data from the CUDA device to the host memory\n"); d_C.CopyToHost(h_C); // Verify that the result vector is correct for (int i = 0; i < numElements; ++i) { if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { Console.WriteLine("Result verification failed at element {0}!\n", i); return; } } Console.WriteLine("Test PASSED\n"); // Free device global memory d_A.Dispose(); d_B.Dispose(); d_C.Dispose(); ctx.Dispose(); Console.WriteLine("Done\n"); }
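Instead of forwarding the process arguments straight into rtc.Compile, the GPU architecture can be passed explicitly, as another snippet in this collection does; a short sketch under that assumption:

// Sketch: compile for the compute capability of device 0.
var cc = CudaContext.GetDeviceComputeCapability(0);
rtc.Compile(new[] { "--gpu-architecture=compute_" + cc.Major + cc.Minor });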
static void Main(string[] args) { try { if (args.Length == 1 && args[0].ToLower().Contains("fidelity")) { string[] fseg = args[0].Split(':'); deviceID = int.Parse(fseg[1]); nonce = Int64.Parse(fseg[2]) - 1; range = int.Parse(fseg[3]); QTEST = true; } else { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error: " + ex.Message); } try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, /*!fastCuda ? 
(CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) :*/ CUCtxFlags.MapHost); string pow = new StreamReader(resourceStream).ReadToEnd(); //pow = File.ReadAllText(@"kernel_x64.ptx"); Turing = ctx.GetDeviceInfo().MaxSharedMemoryPerMultiprocessor == 65536; using (var s = GenerateStreamFromString(pow)) { if (!Turing) { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_A2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_A1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_A3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } else { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_C2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound.BlockDimensions = 1024; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_C1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_C3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32 * (temp ? 
8 : 1)); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 2)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE); d_aux = new CudaDeviceVariable <uint>(512); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesA.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Mem alloc exception. Out of video memory? {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job....")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && ++loopCnt >= range) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[32]; d_indexesB.CopyToHost(s.nonces, 0, 0, 32 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); //fidelity = (32-cycles_found / graphs_searched) * 32 solutions++; s.fidelity = ((double)solutions / (double)trims) * 32.0; //Console.WriteLine(s.fidelity.ToString("0.000")); if (Comms.IsConnected()) { Comms.graphSolutionsOut.Enqueue(s); Comms.SetEvent(); } if (QTEST) { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine($"Solution for nonce {s.job.nonce}: {string.Join(' ', s.nonces)}"); Console.ResetColor(); } } if (QTEST) { currentJob = currentJob.NextSequential(ref nonce); Console.WriteLine($"Nonce: {nonce} K0: {currentJob.k0:X} K1: {currentJob.k1:X} K2: {currentJob.k2:X} K3: {currentJob.k3:X}"); } else { currentJob = currentJob.Next(); } Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); d_aux.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer, 0); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 1, d_indexesB.DevicePointer + (4096 * 4), EDGE_SEG); 
meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 2, d_indexesB.DevicePointer + (4096 * 8), EDGE_SEG * 2); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 3, d_indexesB.DevicePointer + (4096 * 12), EDGE_SEG * 3); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 0); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 1024); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 2048); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 3072); //streamPrimary.Synchronize(); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); //streamPrimary.Synchronize(); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 0, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 1, d_aux.DevicePointer); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 2, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4, 3, d_aux.DevicePointer); for (int i = 0; i < (TEST ? 
80 : trimRounds); i++) //for (int i = 0; i < 85; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 4, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 5, d_aux.DevicePointer); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); Task.Delay((int)lastTrimMs).Wait(); streamPrimary.Synchronize(); uint[] count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 131071) { // trouble count[0] = 131071; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); trims++; timer.Stop(); lastTrimMs = (long)Math.Min(Math.Max((float)timer.ElapsedMilliseconds * 0.9f, 50), 500); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0], deviceID)); FinderBag.RunFinder(TEST, ref trims, count[0], h_a, currentJob, graphSolutions, timer); if (trims % 50 == 0 && TEST) { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } /* * if (TEST) * { * //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); * * CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * //currentJob = currentJob.Next(); * * Task.Factory.StartNew(() => * { * Stopwatch sw = new Stopwatch(); * sw.Start(); * * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * //if (graphSolutions.Count > 0) solutions++; * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * * sw.Stop(); * * if (trims % 50 == 0) * { * Console.ForegroundColor = ConsoleColor.Green; * Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims/solutions ); * Console.ResetColor(); * } * //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); * //Console.WriteLine("Duped edges: {0}", cg.dupes); * if (!QTEST) * Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); * }); * * //h_indexesA = d_indexesA; * //h_indexesB = d_indexesB; * * //var sumA = h_indexesA.Sum(e => e); * //var sumB = h_indexesB.Sum(e => e); * * ; * } * else * { * 
CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * Task.Factory.StartNew(() => * { * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Warning, "Cycle finder crashed: " + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * }); * } * */ } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(500).Wait(); break; } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); d_aux.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }
static void Main(string[] args) { try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, !fastCuda ? 
(CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost); meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A"); meanSeedA.BlockDimensions = 128; meanSeedA.GridDimensions = 2048; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB.BlockDimensions = 128; meanSeedB.GridDimensions = 2048; meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB_4.BlockDimensions = 128; meanSeedB_4.GridDimensions = 1024; meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound_2.BlockDimensions = 512; meanRound_2.GridDimensions = 2048; meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J"); meanRoundJoin.BlockDimensions = 512; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesB.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); streamSecondary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Out of video memory!
Only {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job...")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && loopCnt++ > 100) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[40]; d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); lock (Comms.graphSolutionsOut) { Comms.graphSolutionsOut.Enqueue(s); } Comms.SetEvent(); } uint[] count; do { if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } currentJob = currentJob.Next(); Logger.Log(LogLevel.Debug, string.Format("GPU NV{5}: Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream,
d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); //d_indexesA.MemsetAsync(0, streamPrimary.Stream); //meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4); for (int i = 0; i < trimRounds; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); ctx.Synchronize(); streamPrimary.Synchronize(); count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 4194304) { // trouble count[0] = 4194304; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); timer.Stop(); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height)); }while((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST)); if (TEST) { //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); CGraph cg = FinderBag.GetFinder(); if (cg == null) { continue; } cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); //currentJob = currentJob.Next(); Task.Factory.StartNew(() => { Stopwatch sw = new Stopwatch(); sw.Start(); if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } sw.Stop(); if (++trims % 50 == 0) { Console.ForegroundColor = ConsoleColor.Green; 
Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); //Console.WriteLine("Duped edges: {0}", cg.dupes); Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); }); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); ; } else { CGraph cg = FinderBag.GetFinder(); cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); Task.Factory.StartNew(() => { if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } }); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(5000).Wait(); } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }
public static void Execute() { Console.WriteLine("Barycentric stuff"); //Init Cuda context ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); //Load Kernel image from resources string resName = "baryTest.ptx"; string resNamespace = "TestManagedCuda"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } CudaKernel baryKernel = ctx.LoadKernelPTX(stream, "baryKernel"); framebufferSize = new int2(5, 5); // Set up the triangle vertices and per-vertex depth values in host memory h_v0 = new float3(0, 1, 0); h_v1 = new float3(1, -1, 0); h_v2 = new float3(-1, -1, 0); h_da = 3; h_db = 2; h_dc = 1; h_dOut = new float[framebufferSize.x * framebufferSize.y]; h_width = framebufferSize.x; h_height = framebufferSize.y; // Allocate vectors in device memory and copy vectors from host memory to device memory // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation. dev_v0 = h_v0; dev_v1 = h_v1; dev_v2 = h_v2; dev_da = h_da; dev_db = h_db; dev_dc = h_dc; dev_dOut = new CudaDeviceVariable <float>(framebufferSize.x * framebufferSize.y); dev_width = h_width; dev_height = h_height; // Invoke kernel //int threadsPerBlock = 256; //vectorAddKernel.BlockDimensions = threadsPerBlock; //vectorAddKernel.GridDimensions = (framebufferSize.x + threadsPerBlock - 1) / threadsPerBlock; dim3 windowSize = new dim3(framebufferSize.x, framebufferSize.y); dim3 blockSize = new dim3(16, 16, 1); dim3 gridSize = new dim3(windowSize.x / blockSize.x + 1, windowSize.y / blockSize.y + 1); baryKernel.BlockDimensions = blockSize; baryKernel.GridDimensions = gridSize; baryKernel.Run(dev_v0.DevicePointer, dev_v1.DevicePointer, dev_v2.DevicePointer, dev_da.DevicePointer, dev_db.DevicePointer, dev_dc.DevicePointer, dev_dOut.DevicePointer, dev_width.DevicePointer, dev_height.DevicePointer); // Copy result from device memory to host memory; h_dOut contains the result h_dOut = dev_dOut; CleanupResources(); Console.Write("{\n"); for (int y = 0; y < framebufferSize.y; y++) { Console.Write(" {"); for (int x = 0; x < framebufferSize.x; x++) { Console.Write(h_dOut[x + y * framebufferSize.x] + "|"); } Console.Write("}\n"); } Console.Write("}\n"); Console.ReadKey(); }
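The grid above is sized as windowSize / blockSize + 1 per axis, which always launches one extra row and column of blocks whenever the window is an exact multiple of the block size; the kernel then has to rely on its width/height arguments to bounds-check. A ceiling-division sketch that launches just enough blocks; the helper names are illustrative:

using ManagedCuda.VectorTypes;

static class LaunchConfig
{
    // Ceiling division: the smallest number of blocks that covers 'size' work items.
    public static int BlocksFor(int size, int threadsPerBlock)
    {
        return (size + threadsPerBlock - 1) / threadsPerBlock;
    }

    // For the 5x5 framebuffer and 16x16 blocks above this yields a 1x1 grid,
    // the same result as windowSize / blockSize + 1, but without a redundant extra
    // row/column of blocks when the window is an exact multiple of the block size.
    public static dim3 GridFor(int width, int height, int blockX, int blockY)
    {
        return new dim3(BlocksFor(width, blockX), BlocksFor(height, blockY), 1);
    }
}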
static void Main(string[] args) { int SIGNAL_SIZE = 50; int FILTER_KERNEL_SIZE = 11; Console.WriteLine("[simpleCUFFT] is starting..."); var assembly = Assembly.GetExecutingAssembly(); var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx"; CudaContext ctx = new CudaContext(0); CudaKernel ComplexPointwiseMulAndScale; string[] liste = assembly.GetManifestResourceNames(); using (Stream stream = assembly.GetManifestResourceStream(resourceName)) { ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale"); } // Allocate host memory for the signal cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplication in the reference host code... Random rand = new Random(0); // Initialize the memory for the signal for (int i = 0; i < SIGNAL_SIZE; ++i) { h_signal[i].real = (float)rand.NextDouble(); h_signal[i].imag = 0; } // Allocate host memory for the filter cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE]; // Initialize the memory for the filter for (int i = 0; i < FILTER_KERNEL_SIZE; ++i) { h_filter_kernel[i].real = (float)rand.NextDouble(); h_filter_kernel[i].imag = 0; } // Pad signal and filter kernel cuFloatComplex[] h_padded_signal = null; cuFloatComplex[] h_padded_filter_kernel = null; int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE, h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE); int mem_size = (int)cuFloatComplex.SizeOf * new_size; // Allocate device memory for signal CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size); // Copy host memory to device d_signal.CopyToDevice(h_padded_signal); // Allocate device memory for filter kernel CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size); // Copy host memory to device d_filter_kernel.CopyToDevice(h_padded_filter_kernel); // CUFFT plan simple API CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1); // Transform signal and kernel Console.WriteLine("Transforming signal cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Forward); plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward); // Multiply the coefficients together and normalize the result Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>"); ComplexPointwiseMulAndScale.BlockDimensions = 256; ComplexPointwiseMulAndScale.GridDimensions = 32; ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size); // Transform signal back Console.WriteLine("Transforming signal back cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse); // Copy device memory to host cuFloatComplex[] h_convolved_signal = d_signal; // Allocate host memory for the convolution result cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE]; // Convolve on the host Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref); // check result bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f); //Destroy CUFFT context plan.Dispose(); // cleanup memory d_filter_kernel.Dispose(); d_signal.Dispose(); ctx.Dispose(); if (bTestResult) { Console.WriteLine("Test Passed"); } else { Console.WriteLine("Test Failed"); } }
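PadData, Convolve, and sdkCompareL2fe are helpers from the original SDK sample and are not included in this listing. As a point of reference, a relative L2 comparison in the spirit of sdkCompareL2fe might look like the sketch below; the namespace for cuFloatComplex and the exact tolerance semantics are assumptions:

using System;
using ManagedCuda.VectorTypes;   // assumed location of cuFloatComplex

static class SignalCheck
{
    // Relative L2 error between a reference signal and the GPU result, compared
    // against eps — one plausible reading of what sdkCompareL2fe verifies.
    public static bool CompareL2(cuFloatComplex[] reference, cuFloatComplex[] data, float eps)
    {
        int n = Math.Min(reference.Length, data.Length);
        double errSq = 0.0, refSq = 0.0;
        for (int i = 0; i < n; i++)
        {
            double dr = reference[i].real - data[i].real;
            double di = reference[i].imag - data[i].imag;
            errSq += dr * dr + di * di;
            refSq += (double)reference[i].real * reference[i].real + (double)reference[i].imag * reference[i].imag;
        }
        return Math.Sqrt(errSq) <= eps * Math.Sqrt(refSq);
    }
}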
static void Main(string[] args) { ShrQATest.shrQAStart(args); Console.WriteLine("Vector Addition"); int N = 50000; //Init Cuda context ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) resName = "vectorAdd_x64.ptx"; else resName = "vectorAdd.ptx"; string resNamespace = "vectorAdd"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) throw new ArgumentException("Kernel not found in resources."); CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd"); // Allocate input vectors h_A and h_B in host memory h_A = new float[N]; h_B = new float[N]; // Initialize input vectors RandomInit(h_A, N); RandomInit(h_B, N); // Allocate vectors in device memory and copy vectors from host memory to device memory // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation. d_A = h_A; d_B = h_B; d_C = new CudaDeviceVariable<float>(N); // Invoke kernel int threadsPerBlock = 256; vectorAddKernel.BlockDimensions = threadsPerBlock; vectorAddKernel.GridDimensions = (N + threadsPerBlock - 1) / threadsPerBlock; vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N); // Copy result from device memory to host memory // h_C contains the result in host memory h_C = d_C; // Verify result int i; for (i = 0; i < N; ++i) { float sum = h_A[i] + h_B[i]; if (Math.Abs(h_C[i] - sum) > 1e-5) break; } CleanupResources(); ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
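RandomInit and CleanupResources are not shown in this listing. A minimal sketch of the kind of initializer the sample presumably uses, filling the host arrays with random floats in [0, 1) the way the other vector-add examples here seed their inputs (the seed and distribution are assumptions):

using System;

static class HostData
{
    static readonly Random rand = new Random();

    // Fill 'data' with random floats in [0, 1); a guess at what RandomInit does,
    // consistent with the other vector-add samples in this listing.
    public static void RandomInit(float[] data, int n)
    {
        for (int i = 0; i < n; i++)
        {
            data[i] = (float)rand.NextDouble();
        }
    }
}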
static void Main(string[] args) { string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder... string fileToCompile = File.ReadAllText(filename); CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"); rtc.Compile(args); string log = rtc.GetLogAsString(); Console.WriteLine(log); byte[] ptx = rtc.GetPTX(); rtc.Dispose(); CudaContext ctx = new CudaContext(0); CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd"); // Print the vector length to be used, and compute its size int numElements = 50000; SizeT size = numElements * sizeof(float); Console.WriteLine("[Vector addition of {0} elements]", numElements); // Allocate the host input vector A float[] h_A = new float[numElements]; // Allocate the host input vector B float[] h_B = new float[numElements]; // Allocate the host output vector C float[] h_C = new float[numElements]; Random rand = new Random(0); // Initialize the host input vectors for (int i = 0; i < numElements; ++i) { h_A[i] = (float)rand.NextDouble(); h_B[i] = (float)rand.NextDouble(); } Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n"); // Allocate the device input vector A and copy to device CudaDeviceVariable<float> d_A = h_A; // Allocate the device input vector B and copy to device CudaDeviceVariable<float> d_B = h_B; // Allocate the device output vector C CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements); // Launch the Vector Add CUDA Kernel int threadsPerBlock = 256; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock); vectorAdd.BlockDimensions = new dim3(threadsPerBlock,1, 1); vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1); vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements); // Copy the device result vector in device memory to the host result vector // in host memory. Console.WriteLine("Copy output data from the CUDA device to the host memory\n"); d_C.CopyToHost(h_C); // Verify that the result vector is correct for (int i = 0; i < numElements; ++i) { if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { Console.WriteLine("Result verification failed at element {0}!\n", i); return; } } Console.WriteLine("Test PASSED\n"); // Free device global memory d_A.Dispose(); d_B.Dispose(); d_C.Dispose(); ctx.Dispose(); Console.WriteLine("Done\n"); }
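Passing the process arguments straight to rtc.Compile works, but when the program is started with no arguments the kernel is compiled with no options at all, and the log is only inspected by hand. A hedged variant that pins a target architecture and always prints the compile log; the option string, the chosen architecture, and the wrapper name are illustrative:

using System;
using ManagedCuda.NVRTC;   // assumed namespace for CudaRuntimeCompiler

static class KernelBuild
{
    // Compile CUDA C source to PTX with an explicit target architecture and
    // always dump the NVRTC log (warnings on success, errors on failure).
    public static byte[] CompileToPtx(string source, string kernelFileName)
    {
        var rtc = new CudaRuntimeCompiler(source, kernelFileName);
        try
        {
            // "--gpu-architecture=compute_52" is only an example; match your device.
            rtc.Compile(new[] { "--gpu-architecture=compute_52" });
            return rtc.GetPTX();
        }
        finally
        {
            Console.WriteLine(rtc.GetLogAsString());
            rtc.Dispose();
        }
    }
}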
public void TestPTX() { LLVM.InitializeAllTargets(); LLVM.InitializeAllTargetMCs(); LLVM.InitializeAllTargetInfos(); LLVM.InitializeAllAsmPrinters(); ModuleRef mod = LLVM.ModuleCreateWithName("llvmptx"); var pt = LLVM.PointerType(LLVM.Int64Type(), 1); TypeRef[] param_types = { pt }; TypeRef ret_type = LLVM.FunctionType(LLVM.VoidType(), param_types, false); ValueRef sum = LLVM.AddFunction(mod, "sum", ret_type); BasicBlockRef entry = LLVM.AppendBasicBlock(sum, "entry"); BuilderRef builder = LLVM.CreateBuilder(); LLVM.PositionBuilderAtEnd(builder, entry); var v = LLVM.BuildLoad(builder, LLVM.GetParam(sum, 0), ""); ValueRef tmp = LLVM.BuildAdd(builder, v, LLVM.ConstInt(LLVM.Int64Type(), 1, false), "tmp"); LLVM.BuildStore(builder, tmp, LLVM.GetParam(sum, 0)); LLVM.BuildRetVoid(builder); MyString the_error = new MyString(); LLVM.VerifyModule(mod, VerifierFailureAction.PrintMessageAction, the_error); string triple = "nvptx64-nvidia-cuda"; TargetRef t2; var b = LLVM.GetTargetFromTriple(triple, out t2, the_error); string cpu = ""; string features = ""; TargetMachineRef tmr = LLVM.CreateTargetMachine(t2, triple, cpu, features, CodeGenOptLevel.CodeGenLevelDefault, RelocMode.RelocDefault, CodeModel.CodeModelKernel); ContextRef context_ref = LLVM.ContextCreate(); ValueRef kernelMd = LLVM.MDNodeInContext(context_ref, new ValueRef[3] { sum, LLVM.MDStringInContext(context_ref, "kernel", 6), LLVM.ConstInt(LLVM.Int32TypeInContext(context_ref), 1, false) }); LLVM.AddNamedMetadataOperand(mod, "nvvm.annotations", kernelMd); var y1 = LLVM.TargetMachineEmitToMemoryBuffer( tmr, mod, Swigged.LLVM.CodeGenFileType.AssemblyFile, the_error, out MemoryBufferRef buffer); string ptx = null; try { ptx = LLVM.GetBufferStart(buffer); uint length = LLVM.GetBufferSize(buffer); // Output the PTX assembly code. We can run this using the CUDA Driver API System.Console.WriteLine(ptx); } finally { LLVM.DisposeMemoryBuffer(buffer); } // RUN THE MF. Int64[] h_C = new Int64[100]; CudaContext ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); CudaKernel kernel = ctx.LoadKernelPTX(Encoding.ASCII.GetBytes(ptx), "sum"); var d_C = new CudaDeviceVariable <Int64>(100); int N = 1; int threadsPerBlock = 256; kernel.BlockDimensions = threadsPerBlock; kernel.GridDimensions = (N + threadsPerBlock - 1) / threadsPerBlock; kernel.Run(d_C.DevicePointer); h_C = d_C; System.Console.WriteLine("Result " + h_C[0]); if (h_C[0] != 1) { throw new Exception("Failed."); } LLVM.DumpModule(mod); LLVM.DisposeBuilder(builder); }
public static CudaKernel load_kernel(String kernelName) { byte[] ptx = prepare_kernel(kernelName); return(ctx.LoadKernelPTX(ptx, kernelName)); }
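A hypothetical call site for this helper, written as if it lived in the same class as load_kernel; the kernel name, its parameter list, and the launch shape are placeholders rather than anything from the original project:

public static float[] run_example(float[] input)
{
    // load_kernel returns a ready CudaKernel for the named entry point.
    CudaKernel kernel = load_kernel("exampleKernel");          // placeholder kernel name
    kernel.BlockDimensions = 256;
    kernel.GridDimensions = (input.Length + 255) / 256;        // ceiling division over 256-thread blocks

    CudaDeviceVariable<float> d_data = input;                  // allocate device memory and copy the input
    kernel.Run(d_data.DevicePointer, input.Length);            // placeholder signature: (float*, int)
    d_data.CopyToHost(input);                                  // copy the result back into the managed array
    d_data.Dispose();
    return input;
}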