// Initialization code to find the best CUDA Device static int findCudaDevice(string[] args) { int devID = 0; // If the command-line has a device number specified, use it bool found = false; foreach (var item in args) { if (item.Contains("device=")) { found = true; if (!int.TryParse(item, out devID)) { Console.WriteLine("Invalid command line parameters"); Environment.Exit(-1); } if (devID < 0) { Console.WriteLine("Invalid command line parameters\n"); Environment.Exit(-1); } else { devID = gpuDeviceInit(devID); if (devID < 0) { Console.WriteLine("exiting...\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); Environment.Exit(-1); } } } } if (!found) { // Otherwise pick the device with highest Gflops/s devID = CudaContext.GetMaxGflopsDeviceId(); ctx = new CudaContext(devID, CUCtxFlags.SchedAuto); Console.Write("> Using CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName()); } return(devID); }
static void Main(string[] args) { int cuda_device = 0; int nstreams = 4; // number of streams for CUDA calls int nreps = 10; // number of times each experiment is repeated int n = 16 * 1024 * 1024; // number of ints in the data set int nbytes = n * sizeof(int); // number of data bytes dim3 threads, blocks; // kernel launch configuration float elapsed_time, time_memcpy, time_kernel; // timing variables float scale_factor = 1.0f; // allocate generic memory and pin it laster instead of using cudaHostAlloc() // Untested in C#, so stick to cudaHostAlloc(). bool bPinGenericMemory = false; // we want this to be the default behavior CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync int niterations; // number of iterations for the loop inside the kernel ShrQATest.shrQAStart(args); Console.WriteLine("[ simpleStreams ]"); foreach (var item in args) { if (item.Contains("help")) { printHelp(); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED); } } bPinGenericMemory = false; foreach (var item in args) { if (item.Contains("use_generic_memory")) { bPinGenericMemory = true; } } for (int i = 0; i < args.Length; i++) { if (args[i].Contains("sync_method")) { int temp = -1; bool error = false; if (i < args.Length - 1) { error = int.TryParse(args[i + 1], out temp); switch (temp) { case 0: device_sync_method = CUCtxFlags.SchedAuto; break; case 1: device_sync_method = CUCtxFlags.SchedSpin; break; case 2: device_sync_method = CUCtxFlags.SchedYield; break; case 4: device_sync_method = CUCtxFlags.BlockingSync; break; default: error = true; break; } } if (!error) { Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]); nreps = 100; } else { Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); } } } int num_devices = CudaContext.GetDeviceCount(); if (0 == num_devices) { Console.Write("your system does not have a CUDA capable device, waiving test...\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); } cuda_device = CudaContext.GetMaxGflopsDeviceId(); CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device); if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1)) { Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName); } if (deviceProp.ComputeCapability.Major >= 2) { niterations = 100; } else { if (deviceProp.ComputeCapability.Minor > 1) { niterations = 5; } else { niterations = 1; // reduced workload for compute capability 1.0 and 1.1 } } // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case... if (bPinGenericMemory) { Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No"); if (deviceProp.CanMapHostMemory == false) { Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n"); bPinGenericMemory = false; } } // Anything that is less than 32 Cores will have scaled down workload scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f); n = (int)Math.Round((float)n / scale_factor); Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor); Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n", deviceProp.MultiProcessorCount, ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor), ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount); Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor); Console.Write("> array_size = {0}\n\n", n); // enable use of blocking sync, to reduce CPU usage Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]); CudaContext ctx; if (bPinGenericMemory) { ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost); } else { ctx = new CudaContext(cuda_device, device_sync_method); } //Load Kernel image from resources string resName; if (IntPtr.Size == 8) { resName = "simpleStreams_x64.ptx"; } else { resName = "simpleStreams.ptx"; } string resNamespace = "simpleStreams"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array"); // allocate host memory int c = 5; // value to which the array will be initialized int[] h_a = null; // pointer to the array data in host memory CudaPageLockedHostMemory <int> hAligned_a = null; // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT) //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a). //In C++ hAligned_a would point somewhere inside the h_a array. AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes); Console.Write("\nStarting Test\n"); // allocate device memory CudaDeviceVariable <int> d_c = c; //using new implicit cast to allocate memory and asign value CudaDeviceVariable <int> d_a = new CudaDeviceVariable <int>(nbytes / sizeof(int)); CudaStream[] streams = new CudaStream[nstreams]; for (int i = 0; i < nstreams; i++) { streams[i] = new CudaStream(); } // create CUDA event handles // use blocking sync CudaEvent start_event, stop_event; CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default); start_event = new CudaEvent(eventflags); stop_event = new CudaEvent(eventflags); // time memcopy from device start_event.Record(); // record in stream-0, to ensure that all previous CUDA calls have completed hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream); stop_event.Record(); stop_event.Synchronize(); // block until the event is actually recorded time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("memcopy:\t{0:0.00}\n", time_memcpy); // time kernel threads = new dim3(512, 1); blocks = new dim3(n / (int)threads.x, 1); start_event.Record(); init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations); stop_event.Record(); stop_event.Synchronize(); time_kernel = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("kernel:\t\t{0:0.00}\n", time_kernel); ////////////////////////////////////////////////////////////////////// // time non-streamed execution for reference threads = new dim3(512, 1); blocks = new dim3(n / (int)threads.x, 1); start_event.Record(); for (int k = 0; k < nreps; k++) { init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations); hAligned_a.SynchronCopyToHost(d_a); } stop_event.Record(); stop_event.Synchronize(); elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy); ////////////////////////////////////////////////////////////////////// // time execution with nstreams streams threads = new dim3(512, 1); blocks = new dim3(n / (int)(nstreams * threads.x), 1); byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness for (int i = 0; i < nbytes; i++) { memset[i] = 255; } System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes); d_a.Memset(0); // set device memory to all 0s, for testing correctness start_event.Record(); for (int k = 0; k < nreps; k++) { init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; // asynchronously launch nstreams kernels, each operating on its own portion of data for (int i = 0; i < nstreams; i++) { init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations); } // asynchronously launch nstreams memcopies. Note that memcopy in stream x will only // commence executing when all previous CUDA calls in stream x have completed for (int i = 0; i < nstreams; i++) { hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream); } } stop_event.Record(); stop_event.Synchronize(); elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams); // check whether the output is correct Console.Write("-------------------------------\n"); //We can directly access data in hAligned_a using the [] operator, but copying //data first to h_a is faster. System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int)); bool bResults = correct_data(h_a, n, c * nreps * niterations); // release resources for (int i = 0; i < nstreams; i++) { streams[i].Dispose(); } start_event.Dispose(); stop_event.Dispose(); hAligned_a.Dispose(); d_a.Dispose(); d_c.Dispose(); CudaContext.ProfilerStop(); ctx.Dispose(); Console.ReadKey(); ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
static void Main(string[] args) { ShrQATest.shrQAStart(args); Console.WriteLine("Vector Addition"); int N = 50000; //Init Cuda context ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) { resName = "vectorAdd_x64.ptx"; } else { resName = "vectorAdd.ptx"; } string resNamespace = "vectorAdd"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd"); // Allocate input vectors h_A and h_B in host memory h_A = new float[N]; h_B = new float[N]; // Initialize input vectors RandomInit(h_A, N); RandomInit(h_B, N); // Allocate vectors in device memory and copy vectors from host memory to device memory // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation. d_A = h_A; d_B = h_B; d_C = new CudaDeviceVariable <float>(N); // Invoke kernel int threadsPerBlock = 256; vectorAddKernel.BlockDimensions = threadsPerBlock; vectorAddKernel.GridDimensions = (N + threadsPerBlock - 1) / threadsPerBlock; vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N); // Copy result from device memory to host memory // h_C contains the result in host memory h_C = d_C; // Verify result int i; for (i = 0; i < N; ++i) { float sum = h_A[i] + h_B[i]; if (Math.Abs(h_C[i] - sum) > 1e-5) { break; } } CleanupResources(); ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
static void Main(string[] args) { const int nx = 2048; const int ny = 2048; // shifts applied to x and y data const int x_shift = 5; const int y_shift = 7; ShrQATest.shrQAStart(args); if ((nx % TILE_DIM != 0) || (ny % TILE_DIM != 0)) { Console.Write("nx and ny must be multiples of TILE_DIM\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED); } // execution configuration parameters dim3 grid = new dim3(nx / TILE_DIM, ny / TILE_DIM, 1); dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1); // This will pick the best possible CUDA capable device int devID = findCudaDevice(args); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) { resName = "simplePitchLinearTexture_x64.ptx"; } else { resName = "simplePitchLinearTexture.ptx"; } string resNamespace = "simplePitchLinearTexture"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } byte[] kernels = new byte[stream.Length]; int bytesToRead = (int)stream.Length; while (bytesToRead > 0) { bytesToRead -= stream.Read(kernels, (int)stream.Position, bytesToRead); } CudaKernel PLKernel = ctx.LoadKernelPTX(kernels, "shiftPitchLinear"); CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray"); CudaStopWatch stopwatch = new CudaStopWatch(); // ---------------------------------- // Host allocation and initialization // ---------------------------------- float[] h_idata = new float[nx * ny]; float[] h_odata = new float[nx * ny]; float[] gold = new float[nx * ny]; for (int i = 0; i < nx * ny; ++i) { h_idata[i] = (float)i; } // ------------------------ // Device memory allocation // ------------------------ // Pitch linear input data CudaPitchedDeviceVariable <float> d_idataPL = new CudaPitchedDeviceVariable <float>(nx, ny); // Array input data CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One); // Pitch linear output data CudaPitchedDeviceVariable <float> d_odata = new CudaPitchedDeviceVariable <float>(nx, ny); // ------------------------ // copy host data to device // ------------------------ // Pitch linear d_idataPL.CopyToDevice(h_idata); // Array d_idataArray.CopyFromHostToThis <float>(h_idata); // ---------------------- // Bind texture to memory // ---------------------- // Pitch linear CudaTextureLinearPitched2D <float> texRefPL = new CudaTextureLinearPitched2D <float>(PLKernel, "texRefPL", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL); CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray); // --------------------- // reference calculation // --------------------- for (int j = 0; j < ny; j++) { int jshift = (j + y_shift) % ny; for (int i = 0; i < nx; i++) { int ishift = (i + x_shift) % nx; gold[j * nx + i] = h_idata[jshift * nx + ishift]; } } // ---------------- // shiftPitchLinear // ---------------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); PLKernel.BlockDimensions = threads; PLKernel.GridDimensions = grid; stopwatch.Start(); for (int i = 0; i < NUM_REPS; i++) { PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timePL = stopwatch.GetElapsedTime(); // check results d_odata.CopyToHost(h_odata); bool res = cutComparef(gold, h_odata); bool success = true; if (res == false) { Console.Write("*** shiftPitchLinear failed ***\n"); success = false; } // ---------- // shiftArray // ---------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); ArrayKernel.BlockDimensions = threads; ArrayKernel.GridDimensions = grid; stopwatch.Start(); for (int i = 0; i < NUM_REPS; i++) { ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch / sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timeArray = stopwatch.GetElapsedTime(); // check results d_odata.CopyToHost(h_odata); res = cutComparef(gold, h_odata); if (res == false) { Console.Write("*** shiftArray failed ***\n"); success = false; } float bandwidthPL = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timePL / NUM_REPS); float bandwidthArray = 2.0f * 1000.0f * nx * ny * sizeof(float) / (1e+9f) / (timeArray / NUM_REPS); Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n", bandwidthPL, bandwidthArray); float fetchRatePL = nx * ny / 1e+6f / (timePL / (1000.0f * NUM_REPS)); float fetchRateArray = nx * ny / 1e+6f / (timeArray / (1000.0f * NUM_REPS)); Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n", fetchRatePL, fetchRateArray); // cleanup texRefPL.Dispose(); texRefArray.Dispose(); d_idataPL.Dispose(); d_idataArray.Dispose(); d_odata.Dispose(); stopwatch.Dispose(); ctx.Dispose(); ShrQATest.shrQAFinishExit(args, (success == true) ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }