public MyFourierBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock<float> tempBlock)
    : base(owner, inputSize, tempBlock)
{
    m_stream = new CudaStream();

    // Forward (R2C) and inverse (C2R) FFT plans, both bound to the binder's stream
    m_fft = new CudaFFTPlan1D(inputSize, cufftType.R2C, 1);
    m_fft.SetStream(m_stream.Stream);
    m_ifft = new CudaFFTPlan1D(inputSize, cufftType.C2R, 1);
    m_ifft.SetStream(m_stream.Stream);

    m_mulkernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "MulComplexElementWise");
    m_mulkernel.SetupExecution(inputSize + 1);

    m_involutionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "InvolveVector");
    m_involutionKernel.SetupExecution(inputSize - 1);

    m_inversionKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\InvertValuesKernel", "InvertLengthComplexKernel");
    m_inversionKernel.SetupExecution(inputSize);

    m_dotKernel = MyKernelFactory.Instance.KernelProduct<float>(owner, owner.GPU, ProductMode.f_DotProduct_f);

    m_normalKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Transforms\TransformKernels", "PolynomialFunctionKernel");
    m_normalKernel.SetupExecution(inputSize);

    // Offsets into the shared temp block; each FFT result holds (inputSize + 1) complex values,
    // i.e. (inputSize + 1) * 2 floats
    m_firstFFTOffset = 0;
    m_secondFFTOffset = (inputSize + 1) * 2;
    m_tempOffset = (inputSize + 1) * 4;

    Denominator = inputSize;
}
public MyPermutationBinder(MyWorkingNode owner, int inputSize, MyMemoryBlock<float> tempBlock)
    : base(owner, inputSize, tempBlock)
{
    m_stream = new CudaStream();

    m_binaryPermKernel = MyKernelFactory.Instance.Kernel(owner.GPU, @"Common\CombineVectorsKernel", "CombineTwoVectorsKernel");
    m_binaryPermKernel.SetupExecution(inputSize);
}
/// <summary>
/// Launches an executable graph in a stream.<para/>
/// Only one instance of GraphExec may be executing at a time. Each launch is ordered
/// behind both any previous work in Stream and any previous launches of GraphExec.
/// To execute a graph concurrently, it must be instantiated multiple times into
/// multiple executable graphs.
/// </summary>
/// <param name="stream">Stream in which to launch the graph</param>
public void Launch(CudaStream stream)
{
    res = DriverAPINativeMethods.GraphManagment.cuGraphLaunch(_graph, stream.Stream);
    Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuGraphLaunch", res));
    if (res != CUResult.Success)
    {
        throw new CudaException(res);
    }
}
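// Usage sketch for Launch above, assuming an executable-graph wrapper instance that exposes
// the Launch method shown (graphExec is a hypothetical name; the instance would be obtained
// elsewhere, e.g. by instantiating a captured graph). Variable names are illustrative.
CudaStream launchStream = new CudaStream();
graphExec.Launch(launchStream);   // ordered behind any prior work already in launchStream
launchStream.Synchronize();       // block until this launch of the graph has completed
launchStream.Dispose();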
private CudnnContext(CudnnHandle handle, CudaStream stream)
{
    if (handle.Pointer == IntPtr.Zero)
        throw new ArgumentException("handle");
    Contract.EndContractBlock();

    this.handle = handle;
    this.stream = stream;
}
/// <summary>
/// Copies attributes from source stream to destination stream.<para/>
/// Copies attributes from this stream (the source) to destination stream \p dst.<para/>
/// Both streams must have the same context.
/// </summary>
/// <param name="dst">Destination stream</param>
public void CopyAttributes(CudaStream dst)
{
    CUResult res = DriverAPINativeMethods.Streams.cuStreamCopyAttributes(dst.Stream, _stream);
    Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuStreamCopyAttributes", res));
    if (res != CUResult.Success)
    {
        throw new CudaException(res);
    }
}
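// Usage sketch for CopyAttributes above: create two streams in the same context and copy the
// source stream's attributes onto the destination. Variable names are illustrative.
CudaStream srcStream = new CudaStream();
CudaStream dstStream = new CudaStream();
srcStream.CopyAttributes(dstStream); // dstStream now carries srcStream's attributes
dstStream.Dispose();
srcStream.Dispose();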
public static CudnnContext Create(CudaStream stream = null)
{
    CudnnHandle handle = default(CudnnHandle);
    Invoke(() => CudnnNativeMethods.cudnnCreate(out handle));
    Contract.Assume(handle.Pointer != IntPtr.Zero);

    if (stream != null)
        Invoke(() => CudnnNativeMethods.cudnnSetStream(handle, stream.Stream));

    return new CudnnContext(handle, stream);
}
/// <summary>
/// Allocates memory from a specified pool with stream ordered semantics.<para/>
/// Inserts an allocation operation into \p hStream.<para/>
/// A pointer to the allocated memory is returned immediately in *dptr.<para/>
/// The allocation must not be accessed until the allocation operation completes.<para/>
/// The allocation comes from the specified memory pool.<para/>
/// note<para/>
/// - The specified memory pool may be from a device different than that of the specified \p hStream.<para/>
/// - Basic stream ordering allows future work submitted into the same stream to use the allocation.
/// Stream query, stream synchronize, and CUDA events can be used to guarantee that the allocation
/// operation completes before work submitted in a separate stream runs.
/// </summary>
/// <param name="bytesize">Number of bytes to allocate</param>
/// <param name="hStream">The stream establishing the stream ordering semantic</param>
public CudaDeviceVariable<T> MemAllocFromPoolAsync<T>(SizeT bytesize, CudaStream hStream) where T : struct
{
    CUdeviceptr devPtr = new CUdeviceptr();
    res = DriverAPINativeMethods.MemoryManagement.cuMemAllocFromPoolAsync(ref devPtr, bytesize, _memoryPool, hStream.Stream);
    Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemAllocFromPoolAsync", res));
    if (res != CUResult.Success)
    {
        throw new CudaException(res);
    }
    return new CudaDeviceVariable<T>(devPtr, false, bytesize);
}
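// Usage sketch for MemAllocFromPoolAsync above, assuming the surrounding class wraps a CUDA
// memory pool (pool is a hypothetical instance name). The allocation is ordered into
// allocStream, so work submitted to that same stream afterwards may safely use the buffer.
CudaStream allocStream = new CudaStream();
CudaDeviceVariable<float> buffer = pool.MemAllocFromPoolAsync<float>(1024 * sizeof(float), allocStream);
// ... enqueue kernels that use buffer into allocStream ...
allocStream.Synchronize(); // guarantees the allocation and the dependent work have completed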
public void ContextWithStream()
{
    using (var cuda = new CudaContext())
    using (var stream = new CudaStream(CUStreamFlags.Default))
    {
        using (var context = CudnnContext.Create(stream))
        {
            Assert.True(context.IsInitialized);

            var streamId = default(CUstream);
            CudnnContext.Invoke(() => CudnnNativeMethods.cudnnGetStream(context.Handle, out streamId));
            Assert.Equal(stream.Stream, streamId);
        }
    }
}
static void Main(string[] args)
{
    int cuda_device = 0;
    int nstreams = 4;               // number of streams for CUDA calls
    int nreps = 10;                 // number of times each experiment is repeated
    int n = 16 * 1024 * 1024;       // number of ints in the data set
    int nbytes = n * sizeof(int);   // number of data bytes
    dim3 threads, blocks;           // kernel launch configuration
    float elapsed_time, time_memcpy, time_kernel; // timing variables
    float scale_factor = 1.0f;

    // allocate generic memory and pin it later instead of using cudaHostAlloc()
    // Untested in C#, so stick to cudaHostAlloc().
    bool bPinGenericMemory = false; // we want this to be the default behavior
    CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync
    int niterations; // number of iterations for the loop inside the kernel

    ShrQATest.shrQAStart(args);

    Console.WriteLine("[ simpleStreams ]");

    foreach (var item in args)
    {
        if (item.Contains("help"))
        {
            printHelp();
            ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED);
        }
    }

    bPinGenericMemory = false;

    foreach (var item in args)
    {
        if (item.Contains("use_generic_memory"))
        {
            bPinGenericMemory = true;
        }
    }

    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].Contains("sync_method"))
        {
            int temp = -1;
            bool error = false;
            if (i < args.Length - 1)
            {
                error = !int.TryParse(args[i + 1], out temp);
                switch (temp)
                {
                    case 0: device_sync_method = CUCtxFlags.SchedAuto; break;
                    case 1: device_sync_method = CUCtxFlags.SchedSpin; break;
                    case 2: device_sync_method = CUCtxFlags.SchedYield; break;
                    case 4: device_sync_method = CUCtxFlags.BlockingSync; break;
                    default: error = true; break;
                }
            }
            if (!error)
            {
                Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]);
                nreps = 100;
            }
            else
            {
                Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp);
                ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
            }
        }
    }

    int num_devices = CudaContext.GetDeviceCount();
    if (0 == num_devices)
    {
        Console.Write("your system does not have a CUDA capable device, waiving test...\n");
        ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED);
    }

    cuda_device = CudaContext.GetMaxGflopsDeviceId();

    CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device);
    if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1))
    {
        Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName);
    }

    if (deviceProp.ComputeCapability.Major >= 2)
    {
        niterations = 100;
    }
    else
    {
        if (deviceProp.ComputeCapability.Minor > 1)
        {
            niterations = 5;
        }
        else
        {
            niterations = 1; // reduced workload for compute capability 1.0 and 1.1
        }
    }

    // Check if the GPU can map host memory (generic method); if not, override bPinGenericMemory to false.
    // In .NET we cannot easily allocate generic aligned memory, so bPinGenericMemory is always false in our case...
    if (bPinGenericMemory)
    {
        Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No");
        if (deviceProp.CanMapHostMemory == false)
        {
            Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n");
            bPinGenericMemory = false;
        }
    }

    // Anything with fewer than 32 cores gets a scaled-down workload
    scale_factor = Math.Max(32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount), 1.0f);
    n = (int)Math.Round((float)n / scale_factor);

    Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor);
    Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n",
        deviceProp.MultiProcessorCount,
        ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor),
        ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount);
    Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor);
    Console.Write("> array_size = {0}\n\n", n);

    // enable use of blocking sync, to reduce CPU usage
    Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]);

    CudaContext ctx;
    if (bPinGenericMemory)
        ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost);
    else
        ctx = new CudaContext(cuda_device, device_sync_method);

    // Load kernel image from resources
    string resName;
    if (IntPtr.Size == 8)
        resName = "simpleStreams_x64.ptx";
    else
        resName = "simpleStreams.ptx";

    string resNamespace = "simpleStreams";
    string resource = resNamespace + "." + resName;
    Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource);
    if (stream == null) throw new ArgumentException("Kernel not found in resources.");

    CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array");

    // allocate host memory
    int c = 5;          // value to which the array will be initialized
    int[] h_a = null;   // pointer to the array data in host memory
    CudaPageLockedHostMemory<int> hAligned_a = null; // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT)

    // Note: In .NET we have two separate arrays: one in managed memory (h_a), the other in unmanaged memory (hAligned_a).
    // In C++, hAligned_a would point somewhere inside the h_a array.
    AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes);

    Console.Write("\nStarting Test\n");

    // allocate device memory
    CudaDeviceVariable<int> d_c = c; // using the implicit cast to allocate memory and assign the value
    CudaDeviceVariable<int> d_a = new CudaDeviceVariable<int>(nbytes / sizeof(int));

    CudaStream[] streams = new CudaStream[nstreams];
    for (int i = 0; i < nstreams; i++)
    {
        streams[i] = new CudaStream();
    }

    // create CUDA event handles
    // use blocking sync
    CudaEvent start_event, stop_event;
    CUEventFlags eventflags = (device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default;
    start_event = new CudaEvent(eventflags);
    stop_event = new CudaEvent(eventflags);

    // time memcopy from device
    start_event.Record(); // record in stream-0, to ensure that all previous CUDA calls have completed
    hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream);
    stop_event.Record();
    stop_event.Synchronize(); // block until the event is actually recorded
    time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("memcopy:\t{0:0.00}\n", time_memcpy);

    // time kernel
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)threads.x, 1);
    start_event.Record();
    init_array.BlockDimensions = threads;
    init_array.GridDimensions = blocks;
    init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations);
    stop_event.Record();
    stop_event.Synchronize();
    time_kernel = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("kernel:\t\t{0:0.00}\n", time_kernel);

    //////////////////////////////////////////////////////////////////////
    // time non-streamed execution for reference
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)threads.x, 1);
    start_event.Record();
    for (int k = 0; k < nreps; k++)
    {
        init_array.BlockDimensions = threads;
        init_array.GridDimensions = blocks;
        init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations);
        hAligned_a.SynchronCopyToHost(d_a);
    }
    stop_event.Record();
    stop_event.Synchronize();
    elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("non-streamed:\t{0:0.00} ({1:0.00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy);

    //////////////////////////////////////////////////////////////////////
    // time execution with nstreams streams
    threads = new dim3(512, 1);
    blocks = new dim3(n / (int)(nstreams * threads.x), 1);
    byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness
    for (int i = 0; i < nbytes; i++)
    {
        memset[i] = 255;
    }
    System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes);
    d_a.Memset(0); // set device memory to all 0s, for testing correctness

    start_event.Record();
    for (int k = 0; k < nreps; k++)
    {
        init_array.BlockDimensions = threads;
        init_array.GridDimensions = blocks;

        // asynchronously launch nstreams kernels, each operating on its own portion of data
        for (int i = 0; i < nstreams; i++)
            init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations);

        // asynchronously launch nstreams memcopies. Note that the memcopy in stream x will only
        // commence executing when all previous CUDA calls in stream x have completed
        for (int i = 0; i < nstreams; i++)
            hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream);
    }
    stop_event.Record();
    stop_event.Synchronize();
    elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event);
    Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams);

    // check whether the output is correct
    Console.Write("-------------------------------\n");
    // We can directly access the data in hAligned_a using the [] operator, but copying
    // the data to h_a first is faster.
    System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int));
    bool bResults = correct_data(h_a, n, c * nreps * niterations);

    // release resources
    for (int i = 0; i < nstreams; i++)
    {
        streams[i].Dispose();
    }
    start_event.Dispose();
    stop_event.Dispose();
    hAligned_a.Dispose();
    d_a.Dispose();
    d_c.Dispose();

    CudaContext.ProfilerStop();
    ctx.Dispose();

    ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED);
}