private void initGLAndCuda() { //Create render target control m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default); m_renderControl.Dock = DockStyle.Fill; m_renderControl.BackColor = Color.White; m_renderControl.BorderStyle = BorderStyle.FixedSingle; m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown); m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove); m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown); m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged); panel1.Controls.Add(m_renderControl); Console.WriteLine(" OpenGL device is Available"); int deviceID = CudaContext.GetMaxGflopsDeviceId(); ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync); string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount); Console.WriteLine(console); CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx); hvfield = new cData[DS]; dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM); tPitch = dvfield.Pitch; dvfield.CopyToDevice(hvfield); vxfield = new CudaDeviceVariable<cData>(DS); vyfield = new CudaDeviceVariable<cData>(DS); // Create particle array particles = new cData[DS]; initParticles(particles, DIM, DIM); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding); planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding); GL.GenBuffers(1, out vbo); GL.BindBuffer(BufferTarget.ArrayBuffer, vbo); GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw); int bsize; GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize); if (bsize != DS * cData.SizeOf) throw new Exception("Sizes don't match."); GL.BindBuffer(BufferTarget.ArrayBuffer, 0); cuda_vbo_resource = new CudaGraphicsInteropResourceCollection(); cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None)); texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two); stopwatch = new CudaStopWatch(CUEventFlags.Default); reshape(); isInit = true; display(); }
// Testing getting device information via managedCuda private static void GetInformationAboutDevice() { // Number of devices var deviceCount = CudaContext.GetDeviceCount(); Console.WriteLine(deviceCount + " Devices"); if (deviceCount <= 0) { throw new Exception("No cuda device detected"); } // Pick device based on performance. var deviceByFlops = CudaContext.GetMaxGflopsDeviceId(); Console.WriteLine("Unit {0} has the most Gflops", deviceByFlops); var deviceProperties = CudaContext.GetDeviceInfo(deviceByFlops); Console.WriteLine("And has the following properties: "); Console.WriteLine(deviceProperties.DeviceName); Console.WriteLine("Can execute concurrent kernels: " + deviceProperties.ConcurrentKernels); Console.WriteLine("Multi processor count: " + deviceProperties.MultiProcessorCount); Console.WriteLine("Clockrate (mhz): " + (int)deviceProperties.ClockRate / 1000.0); Console.WriteLine("Total global memory (MB): " + deviceProperties.TotalGlobalMemory / 1000000); Console.WriteLine("Is integrated: " + deviceProperties.Integrated); Console.WriteLine("Max block dimension: " + deviceProperties.MaxGridDim); Console.WriteLine("Max block dimension: " + deviceProperties.MaxBlockDim); Console.WriteLine("Max threads per block: " + deviceProperties.MaxThreadsPerBlock); Console.WriteLine("Max threads per multiprocessor: " + deviceProperties.MaxThreadsPerMultiProcessor); Console.WriteLine("Max shared mem block can use (b): " + deviceProperties.SharedMemoryPerBlock); Console.WriteLine("If device can do mem copy and kernel execution: " + deviceProperties.GpuOverlap); Console.WriteLine("can map memory adress space on host and device: " + deviceProperties.CanMapHostMemory); }
private static dim3 GetContigReduceBlock(CudaContext cudaContext, long numSlices, long reductionSize) { // If the number of slices is low but the reduction dimension size // is high, then we should increase block size for greater parallelism. // Aim for at least 32 warps per SM (assume 15 SMs; don't bother // inquiring the real number for now). var smCount = 15; var maxWarps = 4; // better occupancy if many blocks are around // For numSlices > smCount * 8, there are > 32 warps active per SM. if (numSlices < smCount * 8) { maxWarps = 8; if (numSlices < smCount * 4) { maxWarps = 16; if (numSlices < smCount * 2) { maxWarps = 32; } } } // Scale up block size based on the reduction dimension size var warpsInReductionSize = ApplyUtils.CeilDiv(reductionSize, 32); var numWarps = warpsInReductionSize > maxWarps ? maxWarps : (int)warpsInReductionSize; var targetSize = numWarps * 32; targetSize = Math.Min(targetSize, (int)cudaContext.GetDeviceInfo().MaxBlockDim.x); return(new dim3(targetSize)); }
protected void SetupCuda() { // Try to bind a CUDA context to the graphics card that WPF is working with. Adapter d3dAdapter = Device.Factory.GetAdapter(0); CUdevice[] cudaDevices = null; try { // Build a CUDA context from the first adapter in the used D3D11 device. cudaDevices = CudaContext.GetDirectXDevices(Device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D11); Debug.Assert(cudaDevices.Length > 0); Console.WriteLine("> Display Device #" + d3dAdapter + ": \"" + d3dAdapter.Description + "\" supports Direct3D11 and CUDA.\n"); } catch (CudaException) { // No Cuda device found for this Direct3D11 device. Console.Write("> Display Device #" + d3dAdapter + ": \"" + d3dAdapter.Description + "\" supports Direct3D11 but not CUDA.\n"); } ContextCuda = new CudaContext(cudaDevices[0], Device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D11); var info = ContextCuda.GetDeviceInfo(); Console.WriteLine("Max. Nr. Threads: " + info.MaxBlockDim + ", Total: " + info.MaxThreadsPerBlock + "\nMax. Nr. Blocks: " + info.MaxGridDim + "\nMax. Bytes Shared Per Block: " + info.SharedMemoryPerBlock); }
public static bool AnySupportedGpu() { var devicesCount = CudaContext.GetDeviceCount(); return(Enumerable .Range(0, devicesCount) .Any(deviceId => CudaContext.GetDeviceInfo(deviceId).ComputeCapability.Major >= 2)); }
public GPUInfo(int deviceID) { CudaDeviceProperties props = CudaContext.GetDeviceInfo(deviceID); DeviceID = deviceID; DeviceProperties = props; TotalVRam = props.TotalGlobalMemory; FreeVRam = props.TotalConstantMemory; }
private static CudaContext ContextWithDevice(CudaDeviceProperties device) { int deviceIndex = Enumerable .Range(0, CudaContext.GetDeviceCount()) .Where(i => CudaContext.GetDeviceInfo(i).Equals(device)) .FirstOrDefault(); return(new CudaContext(deviceIndex)); }
public CudaProcessor(int deviceId) { ctx = new CudaContext(deviceId, true); var props = ctx.GetDeviceInfo(); defaultBlockCount = props.MultiProcessorCount * 32; defaultThreadsPerBlock = props.MaxThreadsPerBlock; warpSize = props.WarpSize; }
public SomeState(int deviceId) { // note that this initializes a lot of things and binds *to the thread* ctx = new CudaContext(deviceId, true); var props = ctx.GetDeviceInfo(); defaultBlockCount = props.MultiProcessorCount * 32; defaultThreadsPerBlock = props.MaxThreadsPerBlock; warpSize = props.WarpSize; }
//static float3[] h_A; //static float3[] h_C; //static CudaDeviceVariable<float3> d_A; //static CudaDeviceVariable<float3> d_C; public CalculateHeatmap() { ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); dev = ctx.GetDeviceInfo(); Console.WriteLine("Using CUDA Device {0} compute level {1} timeout {2}", dev.DeviceName, dev.ComputeCapability, dev.KernelExecTimeoutEnabled ? "enabled" : "disabled"); string resName; resName = @"C:\WEDEV\GpuImplementations\GpuInterpolation\RasterInterpolation_x64.ptx"; Console.WriteLine("Loading Interpolation Kernel"); InterpolateKernel = ctx.LoadKernelPTX(resName, "RasterInterpolate"); }
public CudaFloat32Handler(int deviceId = 0) : base(new CudaFloat32BackendHandle(deviceId, backendTag: -1)) { _cudaBackendHandle = (CudaFloat32BackendHandle)DiffsharpBackendHandle; DeviceId = _cudaBackendHandle.CudaContext.DeviceId; CudaDeviceProperties deviceProperties = CudaContext.GetDeviceInfo(deviceId); _logger.Info($"Using CUDA device {deviceProperties.DeviceName} with device id {deviceId} (compute capability {deviceProperties.ComputeCapability}, " + $"memory {deviceProperties.TotalGlobalMemory / (1024 * 1024)}MB)."); RegisterContext(_cudaBackendHandle.CudaContext, _cudaBackendHandle.CudaStream); }
public DeviceState(int deviceId) { CudaContext = new CudaContext(deviceId); DeviceInfo = CudaContext.GetDeviceInfo(); BlasHandles = new ObjectPool <CudaBlas>(1, () => { CudaContext.SetCurrent(); return(new CudaBlas()); }, blas => blas.Dispose()); MemoryAllocator = new PoolingDeviceAllocator(CudaContext); ScratchSpace = AllocScratchSpace(CudaContext, DeviceInfo); }
private static IList <int> GetPerformanceOrderedDeviceIds() { var cudaDevices = new List <Tuple <int, CudaDeviceProperties> >(); for (int i = 0; i < CudaContext.GetDeviceCount(); i++) { cudaDevices.Add(Tuple.Create(i, CudaContext.GetDeviceInfo(i))); } return(cudaDevices .Where(device => device.Item2.DriverVersion.Major != 999) // remove gpu emulators .OrderByDescending(device => device.Item2.ComputeCapability) .ThenByDescending(device => DevicePerformanceValue(device.Item2)) .Select(device => device.Item1) .ToList()); }
private void SVP_plugin_v3_Load(object sender, EventArgs e) { if (this.ParentForm != null) { mv = (mainView)Application.OpenForms[0]; //--this links mv property to signalplant application } numFreq1.Maximum = System.Convert.ToDecimal(mainView.sampleFrequency / 2); numFreq2.Maximum = System.Convert.ToDecimal(mainView.sampleFrequency / 2); numOrder.Enabled = true; numFreq1.Enabled = true; numFreq2.Enabled = false; label9.Text = Get_CUDA_DriversDirectory(); if (label9.Text == "No CUDA drivers found.") { label9.BackColor = Color.Red; MessageBox.Show("No CUDA drivers found."); cudaOK = false; } int deviceCount = CudaContext.GetDeviceCount(); List <CudaDeviceProperties> result = new List <CudaDeviceProperties>(deviceCount); for (int i = 0; i < deviceCount; i++) { result.Add(CudaContext.GetDeviceInfo(i)); } label1.Text = result[0].DeviceName; label2.Text = (result[0].TotalGlobalMemory / (1024 * 1024)).ToString() + " MB"; label3.Text = (result[0].ClockRate / 1000).ToString() + " MHz"; if (result[0].ComputeCapability.Major >= 3) { label4.Text = (result[0].ComputeCapability).ToString(); } else { label4.Text = (result[0].ComputeCapability).ToString() + ". GPU Compute capability not supported."; label4.BackColor = Color.Red; MessageBox.Show("GPU Compute capability not supported. Compute capability 3.0 or greater is supported."); cudaOK = false; } label10.Visible = false; if (cudaOK == false) { this.Enabled = false; } refrControls(); }
public static CUDA[] GetDevices() { try { var deviceCount = GetDeviceCount(); var deviceList = new List <CUDA>(deviceCount); Log(Level.Info, $"{deviceCount} devices found"); for (var i = 0; i < deviceCount; i++) { var deviceInfo = CudaContext.GetDeviceInfo(i); var availableMemory = 0ul; using (var context = new CudaContext(deviceInfo.PciDeviceId)) availableMemory = context.GetFreeDeviceMemorySize(); deviceList.Add(new CUDA() { Type = PlatformType.NVIDIA_CUDA, DeviceID = i, Info = deviceInfo, AvailableMemory = availableMemory, Allow = (Math.Round(availableMemory / Math.Pow(2, 30), 1) >= RequiredGPUMemoryGB()) && (deviceInfo.ComputeCapability.Major >= 6 && deviceInfo.ComputeCapability.Minor >= 1) }); } Log(Level.Info, $"{deviceList.Count(d => d.Allow)} devices with video memory of >{RequiredGPUMemoryGB()}GB"); return(deviceList.ToArray()); } catch (Exception ex) { Logger.Log(ex); Log(Level.Error, "Failed to get CUDA devices"); return(Enumerable.Empty <CUDA>().ToArray()); } }
//public void SetConstantVariable(string name, CUdeviceptr value) { m_kernel.SetConstantVariable(name, value); } public MyCudaKernel(string kernelName, CUmodule module, CudaContext cuda, int GPU) { m_GPU = GPU; m_kernel = new CudaKernel(kernelName, module, cuda); MAX_THREADS = cuda.GetDeviceInfo().MaxThreadsPerBlock; //TODO: maybe move to KernelFactory }
/// <summary/> public cudaOccDeviceProp(int deviceID) : this(CudaContext.GetDeviceInfo(deviceID)) { }
static void Main(string[] args) { int cuda_device = 0; int nstreams = 4; // number of streams for CUDA calls int nreps = 10; // number of times each experiment is repeated int n = 16 * 1024 * 1024; // number of ints in the data set int nbytes = n * sizeof(int); // number of data bytes dim3 threads, blocks; // kernel launch configuration float elapsed_time, time_memcpy, time_kernel; // timing variables float scale_factor = 1.0f; // allocate generic memory and pin it laster instead of using cudaHostAlloc() // Untested in C#, so stick to cudaHostAlloc(). bool bPinGenericMemory = false; // we want this to be the default behavior CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync int niterations; // number of iterations for the loop inside the kernel ShrQATest.shrQAStart(args); Console.WriteLine("[ simpleStreams ]"); foreach (var item in args) { if (item.Contains("help")) { printHelp(); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED); } } bPinGenericMemory = false; foreach (var item in args) { if (item.Contains("use_generic_memory")) { bPinGenericMemory = true; } } for (int i = 0; i < args.Length; i++) { if (args[i].Contains("sync_method")) { int temp = -1; bool error = false; if (i < args.Length - 1) { error = int.TryParse(args[i + 1], out temp); switch (temp) { case 0: device_sync_method = CUCtxFlags.SchedAuto; break; case 1: device_sync_method = CUCtxFlags.SchedSpin; break; case 2: device_sync_method = CUCtxFlags.SchedYield; break; case 4: device_sync_method = CUCtxFlags.BlockingSync; break; default: error = true; break; } } if (!error) { Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]); nreps = 100; } else { Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); } } } int num_devices = CudaContext.GetDeviceCount(); if (0 == num_devices) { Console.Write("your system does not have a CUDA capable device, waiving test...\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); } cuda_device = CudaContext.GetMaxGflopsDeviceId(); CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device); if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1)) { Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName); } if (deviceProp.ComputeCapability.Major >= 2) { niterations = 100; } else { if (deviceProp.ComputeCapability.Minor > 1) { niterations = 5; } else { niterations = 1; // reduced workload for compute capability 1.0 and 1.1 } } // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case... if (bPinGenericMemory) { Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No"); if (deviceProp.CanMapHostMemory == false) { Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n"); bPinGenericMemory = false; } } // Anything that is less than 32 Cores will have scaled down workload scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f); n = (int)Math.Round((float)n / scale_factor); Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor); Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n", deviceProp.MultiProcessorCount, ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor), ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount); Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor); Console.Write("> array_size = {0}\n\n", n); // enable use of blocking sync, to reduce CPU usage Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]); CudaContext ctx; if (bPinGenericMemory) { ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost); } else { ctx = new CudaContext(cuda_device, device_sync_method); } //Load Kernel image from resources string resName; if (IntPtr.Size == 8) { resName = "simpleStreams_x64.ptx"; } else { resName = "simpleStreams.ptx"; } string resNamespace = "simpleStreams"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array"); // allocate host memory int c = 5; // value to which the array will be initialized int[] h_a = null; // pointer to the array data in host memory CudaPageLockedHostMemory <int> hAligned_a = null; // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT) //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a). //In C++ hAligned_a would point somewhere inside the h_a array. AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes); Console.Write("\nStarting Test\n"); // allocate device memory CudaDeviceVariable <int> d_c = c; //using new implicit cast to allocate memory and asign value CudaDeviceVariable <int> d_a = new CudaDeviceVariable <int>(nbytes / sizeof(int)); CudaStream[] streams = new CudaStream[nstreams]; for (int i = 0; i < nstreams; i++) { streams[i] = new CudaStream(); } // create CUDA event handles // use blocking sync CudaEvent start_event, stop_event; CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default); start_event = new CudaEvent(eventflags); stop_event = new CudaEvent(eventflags); // time memcopy from device start_event.Record(); // record in stream-0, to ensure that all previous CUDA calls have completed hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream); stop_event.Record(); stop_event.Synchronize(); // block until the event is actually recorded time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("memcopy:\t{0:0.00}\n", time_memcpy); // time kernel threads = new dim3(512, 1); blocks = new dim3(n / (int)threads.x, 1); start_event.Record(); init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations); stop_event.Record(); stop_event.Synchronize(); time_kernel = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("kernel:\t\t{0:0.00}\n", time_kernel); ////////////////////////////////////////////////////////////////////// // time non-streamed execution for reference threads = new dim3(512, 1); blocks = new dim3(n / (int)threads.x, 1); start_event.Record(); for (int k = 0; k < nreps; k++) { init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations); hAligned_a.SynchronCopyToHost(d_a); } stop_event.Record(); stop_event.Synchronize(); elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy); ////////////////////////////////////////////////////////////////////// // time execution with nstreams streams threads = new dim3(512, 1); blocks = new dim3(n / (int)(nstreams * threads.x), 1); byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness for (int i = 0; i < nbytes; i++) { memset[i] = 255; } System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes); d_a.Memset(0); // set device memory to all 0s, for testing correctness start_event.Record(); for (int k = 0; k < nreps; k++) { init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; // asynchronously launch nstreams kernels, each operating on its own portion of data for (int i = 0; i < nstreams; i++) { init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations); } // asynchronously launch nstreams memcopies. Note that memcopy in stream x will only // commence executing when all previous CUDA calls in stream x have completed for (int i = 0; i < nstreams; i++) { hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream); } } stop_event.Record(); stop_event.Synchronize(); elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams); // check whether the output is correct Console.Write("-------------------------------\n"); //We can directly access data in hAligned_a using the [] operator, but copying //data first to h_a is faster. System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int)); bool bResults = correct_data(h_a, n, c * nreps * niterations); // release resources for (int i = 0; i < nstreams; i++) { streams[i].Dispose(); } start_event.Dispose(); stop_event.Dispose(); hAligned_a.Dispose(); d_a.Dispose(); d_c.Dispose(); CudaContext.ProfilerStop(); ctx.Dispose(); Console.ReadKey(); ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
private void initGLAndCuda() { //Create render target control m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default); m_renderControl.Dock = DockStyle.Fill; m_renderControl.BackColor = Color.White; m_renderControl.BorderStyle = BorderStyle.FixedSingle; m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown); m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove); m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown); m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged); panel1.Controls.Add(m_renderControl); Console.WriteLine(" OpenGL device is Available"); int deviceID = CudaContext.GetMaxGflopsDeviceId(); ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync); string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount); Console.WriteLine(console); CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx); hvfield = new cData[DS]; dvfield = new CudaPitchedDeviceVariable <cData>(DIM, DIM); tPitch = dvfield.Pitch; dvfield.CopyToDevice(hvfield); vxfield = new CudaDeviceVariable <cData>(DS); vyfield = new CudaDeviceVariable <cData>(DS); // Create particle array particles = new cData[DS]; initParticles(particles, DIM, DIM); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding); planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding); GL.GenBuffers(1, out vbo); GL.BindBuffer(BufferTarget.ArrayBuffer, vbo); GL.BufferData <cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw); int bsize; GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize); if (bsize != DS * cData.SizeOf) { throw new Exception("Sizes don't match."); } GL.BindBuffer(BufferTarget.ArrayBuffer, 0); cuda_vbo_resource = new CudaGraphicsInteropResourceCollection(); cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None)); texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two); stopwatch = new CudaStopWatch(CUEventFlags.Default); reshape(); isInit = true; display(); }
static void Main(string[] args) { try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, !fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) : CUCtxFlags.MapHost); meanSeedA = ctx.LoadKernelPTX(resourceStream, "FluffySeed2A"); meanSeedA.BlockDimensions = 128; meanSeedA.GridDimensions = 2048; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB.BlockDimensions = 128; meanSeedB.GridDimensions = 2048; meanSeedB.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanSeedB_4 = ctx.LoadKernelPTX(resourceStream, "FluffySeed2B"); meanSeedB_4.BlockDimensions = 128; meanSeedB_4.GridDimensions = 1024; meanSeedB_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_2 = ctx.LoadKernelPTX(resourceStream, "FluffyRound"); meanRound_2.BlockDimensions = 512; meanRound_2.GridDimensions = 2048; meanRound_2.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(resourceStream, "FluffyRound_J"); meanRoundJoin.BlockDimensions = 512; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(resourceStream, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(resourceStream, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_A * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE * 2); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesA.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); streamSecondary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Out of video memory! Only {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job....")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && loopCnt++ > 100) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[40]; d_indexesB.CopyToHost(s.nonces, 0, 0, 40 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); lock (Comms.graphSolutionsOut) { Comms.graphSolutionsOut.Enqueue(s); } Comms.SetEvent(); } uint[] count; do { if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } currentJob = currentJob.Next(); Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 0); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 16); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 32); meanSeedB_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, 48); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer + ((BUFFER_SIZE_A * 8) / 4) * 2, d_bufferB.DevicePointer, d_indexesA.DevicePointer + (2048 * 4), d_indexesB.DevicePointer + (4096 * 4), DUCK_EDGES_A, DUCK_EDGES_B / 2); meanRound_2.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_A, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer - (BUFFER_SIZE_B * 8), d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); //d_indexesA.MemsetAsync(0, streamPrimary.Stream); //meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4); for (int i = 0; i < trimRounds; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); ctx.Synchronize(); streamPrimary.Synchronize(); count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 4194304) { // trouble count[0] = 4194304; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); timer.Stop(); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges, h {3}", timer.ElapsedMilliseconds, count[0], deviceID, currentJob.height)); }while((currentJob.height != Comms.nextJob.height) && (!Comms.IsTerminated) && (!TEST)); if (TEST) { //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); CGraph cg = FinderBag.GetFinder(); if (cg == null) { continue; } cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); //currentJob = currentJob.Next(); Task.Factory.StartNew(() => { Stopwatch sw = new Stopwatch(); sw.Start(); if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } sw.Stop(); if (++trims % 50 == 0) { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); //Console.WriteLine("Duped edges: {0}", cg.dupes); Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); }); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); ; } else { CGraph cg = FinderBag.GetFinder(); cg.SetEdges(h_a, (int)count[0]); cg.SetHeader(currentJob); Task.Factory.StartNew(() => { if (count[0] < 200000) { try { if (findersInFlight++ < 3) { Stopwatch cycleTime = new Stopwatch(); cycleTime.Start(); cg.FindSolutions(graphSolutions); cycleTime.Stop(); AdjustTrims(cycleTime.ElapsedMilliseconds); if (graphSolutions.Count > 0) { solutions++; } } else { Logger.Log(LogLevel.Warning, "CPU overloaded!"); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Cycle finder crashed: " + ex.Message); } finally { findersInFlight--; FinderBag.ReturnFinder(cg); } } }); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(5000).Wait(); } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }
// 1.Простой тест // 2.Тест с передачей и приемом больших и разных структур public string GetSummary() { string s = ""; //int deviceID = 0; //CudaContext ctx = new CudaContext(deviceID, CUCtxFlags.MapHost | CUCtxFlags.BlockingSync); //for default setting with device 0: //CudaContext ctx = new CudaContext(); int deviceCount = CudaContext.GetDeviceCount(); s += $"deviceCount = {deviceCount}\n"; int devID = CudaContext.GetMaxGflopsDeviceId(); s += $"GetMaxGflopsDeviceId = {devID}\n"; //return s; for (int deviceID = 0; deviceID < deviceCount; deviceID++) { s += $"----- DeviceID = {deviceID} -----\n"; CudaDeviceProperties props = CudaContext.GetDeviceInfo(deviceID); s += $"DeviceName = {props.DeviceName}\n"; s += $"DriverVersion = {props.DriverVersion.ToString()}\n"; s += $"CUDA ComputeCapability = {props.ComputeCapability.ToString()}\n"; s += $"Integrated = {props.Integrated.ToString()}\n"; s += $"MultiProcessorCount = {props.MultiProcessorCount.ToString()}\n"; s += $"ClockRate = {(props.ClockRate/1000).ToString()} MHz\n"; s += $"TotalGlobalMemory = {(props.TotalGlobalMemory / 1000000).ToString()} Mb\n"; s += $"MemoryClockRate = {(props.MemoryClockRate / 1000).ToString()} MHz\n"; s += $"GlobalMemoryBusWidth = {props.GlobalMemoryBusWidth.ToString()} bit\n"; s += $"maxGridSize[3] (MaxGridDim) = ({props.MaxGridDim.x}; {props.MaxGridDim.y}; {props.MaxGridDim.z}) \n"; s += $"maxThreadsPerBlock = {props.MaxThreadsPerBlock} \n"; s += $"maxThreadsDim[3] (MaxBlockDim) = ({props.MaxBlockDim.x}; {props.MaxBlockDim.y}; {props.MaxBlockDim.z}) \n"; s += $"MaxThreadsPerMultiProcessor = {props.MaxThreadsPerMultiProcessor} \n"; } return(s); /* + name[256] = is an ASCII string identifying the device; + uuid is a 16-byte unique identifier. + totalGlobalMem = is the total amount of global memory available on the device in bytes; + sharedMemPerBlock is the maximum amount of shared memory available to a thread block in bytes; + regsPerBlock is the maximum number of 32-bit registers available to a thread block; + warpSize is the warp size in threads; + memPitch is the maximum pitch in bytes allowed by the memory copy functions that involve memory regions allocated through cudaMallocPitch(); + maxThreadsPerBlock = is the maximum number of threads per block; + maxThreadsDim[3] = MaxBlockDim = contains the maximum size of each dimension of a block; + maxGridSize[3] = MaxGridDim = contains the maximum size of each dimension of a grid; + clockRate = is the clock frequency in kilohertz; + totalConstMem is the total amount of constant memory available on the device in bytes; + major, minor are the major and minor revision numbers defining the device's compute capability; + textureAlignment is the alignment requirement; texture base addresses that are aligned to textureAlignment bytes do not need an offset applied to texture fetches; + texturePitchAlignment is the pitch alignment requirement for 2D texture references that are bound to pitched memory; + deviceOverlap is 1 if the device can concurrently copy memory between host and device while executing a kernel, or 0 if not. Deprecated, use instead asyncEngineCount. + multiProcessorCount = is the number of multiprocessors on the device; + kernelExecTimeoutEnabled is 1 if there is a run time limit for kernels executed on the device, or 0 if not. + integrated = is 1 if the device is an integrated (motherboard) GPU and 0 if it is a discrete (card) component. + canMapHostMemory is 1 if the device can map host memory into the CUDA address space for use with cudaHostAlloc()/cudaHostGetDevicePointer(), or 0 if not; + computeMode is the compute mode that the device is currently in. Available modes are as follows: + cudaComputeModeDefault: Default mode - Device is not restricted and multiple threads can use cudaSetDevice() with this device. + cudaComputeModeExclusive: Compute-exclusive mode - Only one thread will be able to use cudaSetDevice() with this device. + cudaComputeModeProhibited: Compute-prohibited mode - No threads can use cudaSetDevice() with this device. + cudaComputeModeExclusiveProcess: Compute-exclusive-process mode - Many threads in one process will be able to use cudaSetDevice() with this device. + If cudaSetDevice() is called on an already occupied device with computeMode cudaComputeModeExclusive, cudaErrorDeviceAlreadyInUse will be immediately returned indicating the device cannot be used. When an occupied exclusive mode device is chosen with cudaSetDevice, all subsequent non-device management runtime functions will return cudaErrorDevicesUnavailable. + maxTexture1D is the maximum 1D texture size. + maxTexture1DMipmap is the maximum 1D mipmapped texture texture size. + maxTexture1DLinear is the maximum 1D texture size for textures bound to linear memory. + maxTexture2D[2] contains the maximum 2D texture dimensions. + maxTexture2DMipmap[2] contains the maximum 2D mipmapped texture dimensions. + maxTexture2DLinear[3] contains the maximum 2D texture dimensions for 2D textures bound to pitch linear memory. + maxTexture2DGather[2] contains the maximum 2D texture dimensions if texture gather operations have to be performed. + maxTexture3D[3] contains the maximum 3D texture dimensions. + maxTexture3DAlt[3] contains the maximum alternate 3D texture dimensions. + maxTextureCubemap is the maximum cubemap texture width or height. + maxTexture1DLayered[2] contains the maximum 1D layered texture dimensions. + maxTexture2DLayered[3] contains the maximum 2D layered texture dimensions. + maxTextureCubemapLayered[2] contains the maximum cubemap layered texture dimensions. + maxSurface1D is the maximum 1D surface size. + maxSurface2D[2] contains the maximum 2D surface dimensions. + maxSurface3D[3] contains the maximum 3D surface dimensions. + maxSurface1DLayered[2] contains the maximum 1D layered surface dimensions. + maxSurface2DLayered[3] contains the maximum 2D layered surface dimensions. + maxSurfaceCubemap is the maximum cubemap surface width or height. + maxSurfaceCubemapLayered[2] contains the maximum cubemap layered surface dimensions. + surfaceAlignment specifies the alignment requirements for surfaces. + concurrentKernels is 1 if the device supports executing multiple kernels within the same context simultaneously, or 0 if not. It is not guaranteed that multiple kernels will be resident on the device concurrently so this feature should not be relied upon for correctness; + ECCEnabled is 1 if the device has ECC support turned on, or 0 if not. + pciBusID is the PCI bus identifier of the device. + pciDeviceID is the PCI device (sometimes called slot) identifier of the device. + pciDomainID is the PCI domain identifier of the device. + tccDriver is 1 if the device is using a TCC driver or 0 if not. + asyncEngineCount is 1 when the device can concurrently copy memory between host and device while executing a kernel. It is 2 when the device can concurrently copy memory between host and device in both directions and execute a kernel at the same time. It is 0 if neither of these is supported. + unifiedAddressing is 1 if the device shares a unified address space with the host and 0 otherwise. + memoryClockRate is the peak memory clock frequency in kilohertz. + memoryBusWidth is the memory bus width in bits. + l2CacheSize is L2 cache size in bytes. + maxThreadsPerMultiProcessor is the number of maximum resident threads per multiprocessor. + streamPrioritiesSupported is 1 if the device supports stream priorities, or 0 if it is not supported. + globalL1CacheSupported is 1 if the device supports caching of globals in L1 cache, or 0 if it is not supported. + localL1CacheSupported is 1 if the device supports caching of locals in L1 cache, or 0 if it is not supported. + sharedMemPerMultiprocessor is the maximum amount of shared memory available to a multiprocessor in bytes; this amount is shared by all thread blocks simultaneously resident on a multiprocessor; + regsPerMultiprocessor is the maximum number of 32-bit registers available to a multiprocessor; this number is shared by all thread blocks simultaneously resident on a multiprocessor; + managedMemory is 1 if the device supports allocating managed memory on this system, or 0 if it is not supported. + isMultiGpuBoard is 1 if the device is on a multi-GPU board (e.g. Gemini cards), and 0 if not; + multiGpuBoardGroupID is a unique identifier for a group of devices associated with the same board. Devices on the same multi-GPU board will share the same identifier; + singleToDoublePrecisionPerfRatio is the ratio of single precision performance (in floating-point operations per second) to double precision performance. + pageableMemoryAccess is 1 if the device supports coherently accessing pageable memory without calling cudaHostRegister on it, and 0 otherwise. + concurrentManagedAccess is 1 if the device can coherently access managed memory concurrently with the CPU, and 0 otherwise. + computePreemptionSupported is 1 if the device supports Compute Preemption, and 0 otherwise. + canUseHostPointerForRegisteredMem is 1 if the device can access host registered memory at the same virtual address as the CPU, and 0 otherwise. + cooperativeLaunch is 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernel, and 0 otherwise. + cooperativeMultiDeviceLaunch is 1 if the device supports launching cooperative kernels via cudaLaunchCooperativeKernelMultiDevice, and 0 otherwise. + pageableMemoryAccessUsesHostPageTables is 1 if the device accesses pageable memory via the host's page tables, and 0 otherwise. + directManagedMemAccessFromHost is 1 if the host can directly access managed memory on the device without migration, and 0 otherwise. */ /* * deviceCount = 1 * GetMaxGflopsDeviceId = 0 * ----- DeviceID = 0 ----- * DeviceName = GeForce GTX 1050 Ti * DriverVersion = 10.20 * CUDA ComputeCapability = 6.1 * Integrated = False * MultiProcessorCount = 6 * ClockRate = 1392 MHz * TotalGlobalMemory = 4294 Mb * MemoryClockRate = 3504 MHz * GlobalMemoryBusWidth = 128 bit * maxGridSize[3] (MaxGridDim) = (2147483647; 65535; 65535) * maxThreadsPerBlock = 1024 * maxThreadsDim[3] (MaxBlockDim) = (1024; 1024; 64) * MaxThreadsPerMultiProcessor = 2048 */ }
public CudaProcessor() { ctx = new CudaContext(0, false); ctx.GetDeviceInfo(); }
/* PageRank * Find PageRank for a graph with a given transition probabilities, a bookmark vector of dangling vertices, and the damping factor. * This is equivalent to an eigenvalue problem where we want the eigenvector corresponding to the maximum eigenvalue. * By construction, the maximum eigenvalue is 1. * The eigenvalue problem is solved with the power method. * * Initially : * V = 6 * E = 10 * * Edges W * 0 -> 1 0.50 * 0 -> 2 0.50 * 2 -> 0 0.33 * 2 -> 1 0.33 * 2 -> 4 0.33 * 3 -> 4 0.50 * 3 -> 5 0.50 * 4 -> 3 0.50 * 4 -> 5 0.50 * 5 -> 3 1.00 * * bookmark (0.0, 1.0, 0.0, 0.0, 0.0, 0.0)^T note: 1.0 if i is a dangling node, 0.0 otherwise * * Source oriented representation (CSC): * destination_offsets {0, 1, 3, 4, 6, 8, 10} * source_indices {2, 0, 2, 0, 4, 5, 2, 3, 3, 4} * W0 = {0.33, 0.50, 0.33, 0.50, 0.50, 1.00, 0.33, 0.50, 0.50, 1.00} * * ---------------------------------- * * Operation : Pagerank with various damping factor * ---------------------------------- * * Expected output for alpha= 0.9 (result stored in pr_2) : (0.037210, 0.053960, 0.041510, 0.37510, 0.206000, 0.28620)^T * From "Google's PageRank and Beyond: The Science of Search Engine Rankings" Amy N. Langville & Carl D. Meyer */ static void Main(string[] args) { SizeT n = 6, nnz = 10, vertex_numsets = 3, edge_numsets = 1; float[] alpha1 = new float[] { 0.85f }, alpha2 = new float[] { 0.90f }; int i; int[] destination_offsets_h, source_indices_h; float[] weights_h, bookmark_h, pr_1, pr_2; //void** vertex_dim; // nvgraph variables GraphContext handle; GraphDescriptor graph; nvgraphCSCTopology32I CSC_input; cudaDataType[] edge_dimT = new cudaDataType[] { cudaDataType.CUDA_R_32F }; cudaDataType[] vertex_dimT; // use command-line specified CUDA device, otherwise use device with highest Gflops/s int cuda_device = 0; CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device); Console.WriteLine("> Detected Compute SM {0}.{1} hardware with {2} multi-processors", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor, deviceProp.MultiProcessorCount); if (deviceProp.ComputeCapability.Major < 3) { Console.WriteLine("> nvGraph requires device SM 3.0+"); Console.WriteLine("> Waiving."); return; } // Allocate host data destination_offsets_h = new int[n + 1]; source_indices_h = new int[nnz]; weights_h = new float[nnz]; bookmark_h = new float[n]; pr_1 = new float[n]; pr_2 = new float[n]; //vertex_dim = (void**)malloc(vertex_numsets * sizeof(void*)); vertex_dimT = new cudaDataType[vertex_numsets]; CSC_input = new nvgraphCSCTopology32I(); // Initialize host data //vertex_dim[0] = (void*)bookmark_h; vertex_dim[1]= (void*)pr_1, vertex_dim[2]= (void*)pr_2; vertex_dimT[0] = cudaDataType.CUDA_R_32F; vertex_dimT[1] = cudaDataType.CUDA_R_32F; vertex_dimT[2] = cudaDataType.CUDA_R_32F; weights_h[0] = 0.333333f; weights_h[1] = 0.500000f; weights_h[2] = 0.333333f; weights_h[3] = 0.500000f; weights_h[4] = 0.500000f; weights_h[5] = 1.000000f; weights_h[6] = 0.333333f; weights_h[7] = 0.500000f; weights_h[8] = 0.500000f; weights_h[9] = 0.500000f; destination_offsets_h[0] = 0; destination_offsets_h[1] = 1; destination_offsets_h[2] = 3; destination_offsets_h[3] = 4; destination_offsets_h[4] = 6; destination_offsets_h[5] = 8; destination_offsets_h[6] = 10; source_indices_h[0] = 2; source_indices_h[1] = 0; source_indices_h[2] = 2; source_indices_h[3] = 0; source_indices_h[4] = 4; source_indices_h[5] = 5; source_indices_h[6] = 2; source_indices_h[7] = 3; source_indices_h[8] = 3; source_indices_h[9] = 4; bookmark_h[0] = 0.0f; bookmark_h[1] = 1.0f; bookmark_h[2] = 0.0f; bookmark_h[3] = 0.0f; bookmark_h[4] = 0.0f; bookmark_h[5] = 0.0f; // Starting nvgraph handle = new GraphContext(); graph = handle.CreateGraphDecriptor(); GCHandle destination_offsets_handle = GCHandle.Alloc(destination_offsets_h, GCHandleType.Pinned); GCHandle source_indices_handle = GCHandle.Alloc(source_indices_h, GCHandleType.Pinned); CSC_input.nvertices = n; CSC_input.nedges = nnz; CSC_input.destination_offsets = destination_offsets_handle.AddrOfPinnedObject(); CSC_input.source_indices = source_indices_handle.AddrOfPinnedObject(); // Set graph connectivity and properties (tranfers) graph.SetGraphStructure(CSC_input); graph.AllocateVertexData(vertex_dimT); graph.AllocateEdgeData(edge_dimT); graph.SetVertexData(bookmark_h, 0); graph.SetVertexData(pr_1, 1); graph.SetVertexData(pr_2, 2); graph.SetEdgeData(weights_h, 0); // First run with default values graph.Pagerank(0, alpha1, 0, 0, 1, 0.0f, 0); // Get and print result graph.GetVertexData(pr_1, 1); Console.WriteLine("pr_1, alpha = 0.85"); for (i = 0; i < n; i++) { Console.WriteLine(pr_1[i]); } Console.WriteLine(); // Second run with different damping factor and an initial guess for (i = 0; i < n; i++) { pr_2[i] = pr_1[i]; } graph.SetVertexData(pr_2, 2); graph.Pagerank(0, alpha2, 0, 1, 2, 0.0f, 0); // Get and print result graph.GetVertexData(pr_2, 2); Console.WriteLine("pr_2, alpha = 0.90"); for (i = 0; i < n; i++) { Console.WriteLine(pr_2[i]); } Console.WriteLine(); //Clean graph.Dispose(); handle.Dispose(); Console.WriteLine("\nDone!"); }
static void Main(string[] args) { try { if (args.Length == 1 && args[0].ToLower().Contains("fidelity")) { string[] fseg = args[0].Split(':'); deviceID = int.Parse(fseg[1]); nonce = Int64.Parse(fseg[2]) - 1; range = int.Parse(fseg[3]); QTEST = true; } else { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error: " + ex.Message); } try { if (args.Length > 0) { deviceID = int.Parse(args[0]); } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Device ID parse error"); } try { if (args.Length > 1) { port = int.Parse(args[1]); Comms.ConnectToMaster(port); } else { TEST = true; Logger.CopyToConsole = true; CGraph.ShowCycles = true; } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Master connection error"); } try { if (args.Length > 3) { gpuCount = int.Parse(args[3]); fastCuda = gpuCount <= (Environment.ProcessorCount / 2); if (fastCuda) { Logger.Log(LogLevel.Info, "Using single GPU blocking mode"); } } } catch { } if (TEST) { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; } else { currentJob = nextJob = new Job() { jobID = 0, k0 = 0xf4956dc403730b01L, k1 = 0xe6d45de39c2a5a3eL, k2 = 0xcbf626a8afee35f6L, k3 = 0x4307b94b1a0c9980L, pre_pow = TestPrePow, timestamp = DateTime.Now }; if (!Comms.IsConnected()) { Console.WriteLine("Master connection failed, aborting"); Logger.Log(LogLevel.Error, "No master connection, exitting!"); return; } if (deviceID < 0) { int devCnt = CudaContext.GetDeviceCount(); GpuDevicesMessage gpum = new GpuDevicesMessage() { devices = new List <GpuDevice>(devCnt) }; for (int i = 0; i < devCnt; i++) { string name = CudaContext.GetDeviceName(i); var info = CudaContext.GetDeviceInfo(i); gpum.devices.Add(new GpuDevice() { deviceID = i, name = name, memory = info.TotalGlobalMemory }); } //Console.WriteLine(devCnt); Comms.gpuMsg = gpum; Comms.SetEvent(); //Console.WriteLine("event fired"); Task.Delay(1000).Wait(); //Console.WriteLine("closing"); Comms.Close(); return; } } try { var assembly = Assembly.GetEntryAssembly(); var resourceStream = assembly.GetManifestResourceStream("CudaSolver.kernel_x64.ptx"); ctx = new CudaContext(deviceID, /*!fastCuda ? (CUCtxFlags.BlockingSync | CUCtxFlags.MapHost) :*/ CUCtxFlags.MapHost); string pow = new StreamReader(resourceStream).ReadToEnd(); //pow = File.ReadAllText(@"kernel_x64.ptx"); Turing = ctx.GetDeviceInfo().MaxSharedMemoryPerMultiprocessor == 65536; using (var s = GenerateStreamFromString(pow)) { if (!Turing) { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_A2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)40 }); meanRound.BlockDimensions = 512; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_A1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_A3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } else { meanSeedA = ctx.LoadKernelPTX(s, "FluffySeed4K", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanSeedA.BlockDimensions = 512; meanSeedA.GridDimensions = 1024; meanSeedA.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound = ctx.LoadKernelPTX(s, "FluffyRound_C2", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRound.BlockDimensions = 1024; meanRound.GridDimensions = 4096; meanRound.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRound_4 = ctx.LoadKernelPTX(s, "FluffyRound_C1", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)64 }); meanRound_4.BlockDimensions = 1024; meanRound_4.GridDimensions = 1024; meanRound_4.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanRoundJoin = ctx.LoadKernelPTX(s, "FluffyRound_C3", new CUJITOption[] { CUJITOption.MaxRegisters }, new object[] { (uint)32 }); meanRoundJoin.BlockDimensions = 1024; meanRoundJoin.GridDimensions = 4096; meanRoundJoin.PreferredSharedMemoryCarveout = CUshared_carveout.MaxShared; meanTail = ctx.LoadKernelPTX(s, "FluffyTail"); meanTail.BlockDimensions = 1024; meanTail.GridDimensions = 4096; meanTail.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; meanRecover = ctx.LoadKernelPTX(s, "FluffyRecovery"); meanRecover.BlockDimensions = 256; meanRecover.GridDimensions = 2048; meanRecover.PreferredSharedMemoryCarveout = CUshared_carveout.MaxL1; } } } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create kernels: " + ex.Message); Task.Delay(500).Wait(); Comms.Close(); return; } try { d_buffer = new CudaDeviceVariable <ulong>(BUFFER_SIZE_U32 * (temp ? 8 : 1)); d_bufferMid = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 2)); d_bufferB = new CudaDeviceVariable <ulong>(d_buffer.DevicePointer + (BUFFER_SIZE_B * 8)); d_indexesA = new CudaDeviceVariable <uint>(INDEX_SIZE); d_indexesB = new CudaDeviceVariable <uint>(INDEX_SIZE); d_aux = new CudaDeviceVariable <uint>(512); Array.Clear(h_indexesA, 0, h_indexesA.Length); Array.Clear(h_indexesB, 0, h_indexesA.Length); d_indexesA = h_indexesA; d_indexesB = h_indexesB; streamPrimary = new CudaStream(CUStreamFlags.NonBlocking); } catch (Exception ex) { Task.Delay(200).Wait(); Logger.Log(LogLevel.Error, $"Mem alloc exception. Out of video memory? {ctx.GetFreeDeviceMemorySize()} free"); Task.Delay(500).Wait(); Comms.Close(); return; } try { AllocateHostMemory(true, ref h_a, ref hAligned_a, 1024 * 1024 * 32); } catch (Exception ex) { Logger.Log(LogLevel.Error, "Unable to create pinned memory."); Task.Delay(500).Wait(); Comms.Close(); return; } int loopCnt = 0; while (!Comms.IsTerminated) { try { if (!TEST && (Comms.nextJob.pre_pow == null || Comms.nextJob.pre_pow == "" || Comms.nextJob.pre_pow == TestPrePow)) { Logger.Log(LogLevel.Info, string.Format("Waiting for job....")); Task.Delay(1000).Wait(); continue; } if (!TEST && ((currentJob.pre_pow != Comms.nextJob.pre_pow) || (currentJob.origin != Comms.nextJob.origin))) { currentJob = Comms.nextJob; currentJob.timestamp = DateTime.Now; } if (!TEST && (currentJob.timestamp.AddMinutes(30) < DateTime.Now) && Comms.lastIncoming.AddMinutes(30) < DateTime.Now) { Logger.Log(LogLevel.Info, string.Format("Job too old...")); Task.Delay(1000).Wait(); continue; } // test runs only once if (TEST && ++loopCnt >= range) { Comms.IsTerminated = true; } Solution s; while (graphSolutions.TryDequeue(out s)) { meanRecover.SetConstantVariable <ulong>("recovery", s.GetUlongEdges()); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRecover.RunAsync(streamPrimary.Stream, s.job.k0, s.job.k1, s.job.k2, s.job.k3, d_indexesB.DevicePointer); streamPrimary.Synchronize(); s.nonces = new uint[32]; d_indexesB.CopyToHost(s.nonces, 0, 0, 32 * 4); s.nonces = s.nonces.OrderBy(n => n).ToArray(); //fidelity = (32-cycles_found / graphs_searched) * 32 solutions++; s.fidelity = ((double)solutions / (double)trims) * 32.0; //Console.WriteLine(s.fidelity.ToString("0.000")); if (Comms.IsConnected()) { Comms.graphSolutionsOut.Enqueue(s); Comms.SetEvent(); } if (QTEST) { Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine($"Solution for nonce {s.job.nonce}: {string.Join(' ', s.nonces)}"); Console.ResetColor(); } } if (QTEST) { currentJob = currentJob.NextSequential(ref nonce); Console.WriteLine($"Nonce: {nonce} K0: {currentJob.k0:X} K1: {currentJob.k1:X} K2: {currentJob.k2:X} K3: {currentJob.k3:X}"); } else { currentJob = currentJob.Next(); } Logger.Log(LogLevel.Debug, string.Format("GPU NV{4}:Trimming #{4}: {0} {1} {2} {3}", currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, currentJob.jobID, deviceID)); timer.Restart(); d_indexesA.MemsetAsync(0, streamPrimary.Stream); d_indexesB.MemsetAsync(0, streamPrimary.Stream); d_aux.MemsetAsync(0, streamPrimary.Stream); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer, d_indexesB.DevicePointer, 0); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 1, d_indexesB.DevicePointer + (4096 * 4), EDGE_SEG); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 2, d_indexesB.DevicePointer + (4096 * 8), EDGE_SEG * 2); meanSeedA.RunAsync(streamPrimary.Stream, currentJob.k0, currentJob.k1, currentJob.k2, currentJob.k3, d_bufferMid.DevicePointer + ((BUFFER_SIZE_A * 8) / 4 / 4) * 3, d_indexesB.DevicePointer + (4096 * 12), EDGE_SEG * 3); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 0); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 1024); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 2048); meanRound_4.RunAsync(streamPrimary.Stream, d_bufferMid.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_A / 4, DUCK_EDGES_B / 4, 3072); //streamPrimary.Synchronize(); //h_indexesA = d_indexesA; //h_indexesB = d_indexesB; //var sumA = h_indexesA.Sum(e => e); //var sumB = h_indexesB.Sum(e => e); //streamPrimary.Synchronize(); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRoundJoin.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 1, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 2, d_buffer.DevicePointer + ((BUFFER_SIZE_B * 8) / 4) * 3, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 2); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 0, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 1, d_aux.DevicePointer); d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 2, 2, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 2, DUCK_EDGES_B / 4, 3, d_aux.DevicePointer); for (int i = 0; i < (TEST ? 80 : trimRounds); i++) //for (int i = 0; i < 85; i++) { d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 4, d_aux.DevicePointer); d_indexesB.MemsetAsync(0, streamPrimary.Stream); meanRound.RunAsync(streamPrimary.Stream, d_buffer.DevicePointer, d_bufferB.DevicePointer, d_indexesA.DevicePointer, d_indexesB.DevicePointer, DUCK_EDGES_B / 4, DUCK_EDGES_B / 4, i * 2 + 5, d_aux.DevicePointer); } d_indexesA.MemsetAsync(0, streamPrimary.Stream); meanTail.RunAsync(streamPrimary.Stream, d_bufferB.DevicePointer, d_buffer.DevicePointer, d_indexesB.DevicePointer, d_indexesA.DevicePointer); Task.Delay((int)lastTrimMs).Wait(); streamPrimary.Synchronize(); uint[] count = new uint[2]; d_indexesA.CopyToHost(count, 0, 0, 8); if (count[0] > 131071) { // trouble count[0] = 131071; // log } hAligned_a.AsyncCopyFromDevice(d_buffer.DevicePointer, 0, 0, count[0] * 8, streamPrimary.Stream); streamPrimary.Synchronize(); System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, ((int)count[0] * 8) / sizeof(int)); trims++; timer.Stop(); lastTrimMs = (long)Math.Min(Math.Max((float)timer.ElapsedMilliseconds * 0.9f, 50), 500); currentJob.solvedAt = DateTime.Now; currentJob.trimTime = timer.ElapsedMilliseconds; //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); Logger.Log(LogLevel.Info, string.Format("GPU NV{2}: Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0], deviceID)); FinderBag.RunFinder(TEST, ref trims, count[0], h_a, currentJob, graphSolutions, timer); if (trims % 50 == 0 && TEST) { Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims / solutions); Console.ResetColor(); } /* * if (TEST) * { * //Console.WriteLine("Trimmed in {0}ms to {1} edges", timer.ElapsedMilliseconds, count[0]); * * CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * //currentJob = currentJob.Next(); * * Task.Factory.StartNew(() => * { * Stopwatch sw = new Stopwatch(); * sw.Start(); * * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * //if (graphSolutions.Count > 0) solutions++; * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Error, "Cycle finder error" + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * * sw.Stop(); * * if (trims % 50 == 0) * { * Console.ForegroundColor = ConsoleColor.Green; * Console.WriteLine("SOLS: {0}/{1} - RATE: {2:F1}", solutions, trims, (float)trims/solutions ); * Console.ResetColor(); * } * //Console.WriteLine("Finder completed in {0}ms on {1} edges with {2} solution(s)", sw.ElapsedMilliseconds, count[0], graphSolutions.Count); * //Console.WriteLine("Duped edges: {0}", cg.dupes); * if (!QTEST) * Logger.Log(LogLevel.Info, string.Format("Finder completed in {0}ms on {1} edges with {2} solution(s) and {3} dupes", sw.ElapsedMilliseconds, count[0], graphSolutions.Count, cg.dupes)); * }); * * //h_indexesA = d_indexesA; * //h_indexesB = d_indexesB; * * //var sumA = h_indexesA.Sum(e => e); * //var sumB = h_indexesB.Sum(e => e); * * ; * } * else * { * CGraph cg = FinderBag.GetFinder(); * cg.SetEdges(h_a, (int)count[0]); * cg.SetHeader(currentJob); * * Task.Factory.StartNew(() => * { * if (count[0] < 131071) * { * try * { * if (findersInFlight++ < 3) * { * Stopwatch cycleTime = new Stopwatch(); * cycleTime.Start(); * cg.FindSolutions(graphSolutions); * cycleTime.Stop(); * AdjustTrims(cycleTime.ElapsedMilliseconds); * } * else * Logger.Log(LogLevel.Warning, "CPU overloaded!"); * } * catch (Exception ex) * { * Logger.Log(LogLevel.Warning, "Cycle finder crashed: " + ex.Message); * } * finally * { * FinderBag.ReturnFinder(cg); * findersInFlight--; * } * } * }); * } * */ } catch (Exception ex) { Logger.Log(LogLevel.Error, "Critical error in main cuda loop " + ex.Message); Task.Delay(500).Wait(); break; } } // clean up try { Task.Delay(500).Wait(); Comms.Close(); d_buffer.Dispose(); d_indexesA.Dispose(); d_indexesB.Dispose(); d_aux.Dispose(); streamPrimary.Dispose(); streamSecondary.Dispose(); hAligned_a.Dispose(); if (ctx != null) { ctx.Dispose(); } } catch { } }
public float3[] GetPointsGPU(int NumPoints) { int BlockSize = 512; if (NumPoints % BlockSize != 0) { throw new Exception("NumPoints must be divisible by " + BlockSize.ToString()); } int[] TriangleCounts = new int[GridCount + 1]; var Maxima = new float3[GridCount]; var Minima = new float3[GridCount]; TriangleCounts[0] = 0; for (int i = 0; i < GridCount; i++) { int LocalCount = TriangleCounts[i] + (int)Domains[i].TriangleCount; if (Domains[i].TriangleCount > BlockSize) { throw new Exception("STL File must have no more than " + BlockSize.ToString() + " Triangles"); } TriangleCounts[i + 1] = LocalCount; Minima[i] = STLReader.ToFloat3(Domains[i].Extrema.Min); Maxima[i] = STLReader.ToFloat3(Domains[i].Extrema.Max); } var Triangles = new TriangleSTL[TriangleCounts[GridCount]]; int id = 0; for (int i = 0; i < GridCount; i++) { for (int j = 0; j < TriangleCounts[i]; j++) { var LocalTri = Domains[i].Triangles[j]; Triangles[id] = new TriangleSTL(LocalTri); id++; } } var ctx = new CudaContext(1); var DeviceInfo = ctx.GetDeviceInfo(); var d_Triangles = new CudaDeviceVariable <TriangleSTL>(Triangles.Length); var d_TriangleCounts = new CudaDeviceVariable <int>(GridCount); var d_Minima = new CudaDeviceVariable <float3>(GridCount); var d_Maxima = new CudaDeviceVariable <float3>(GridCount); var d_Points = new CudaDeviceVariable <float3>(GridCount * NumPoints); var h_Points = new float3[GridCount * NumPoints]; var rng = new Random(0); // use a sequence that is repeatable over and over again for (int i = 0; i < GridCount * NumPoints; i++) { h_Points[i].x = (float)rng.NextDouble(); h_Points[i].y = (float)rng.NextDouble(); h_Points[i].z = (float)rng.NextDouble(); } int ctr = 0; for (int i = 0; i < GridCount; i++) { for (int j = 0; j < NumPoints; j++) { h_Points[ctr].x = Minima[i].x + h_Points[ctr].x * (Maxima[i].x - Minima[i].x); h_Points[ctr].y = Minima[i].y + h_Points[ctr].y * (Maxima[i].y - Minima[i].y); h_Points[ctr].z = Minima[i].z + h_Points[ctr].z * (Maxima[i].z - Minima[i].z); ctr++; } } d_Points = h_Points; d_Triangles = Triangles; d_TriangleCounts = TriangleCounts; d_Minima = Minima; d_Maxima = Maxima; // copy over to host // TODO generate grid on GPU instead of CPU var PointInPolygonKernel = ctx.LoadKernelPTX("PointInPolygon.ptx", "PointInPolygon"); var BlockDim = new dim3(BlockSize, 1, 1); var GridDim = new dim3(GridCount, 1, 1); PointInPolygonKernel.BlockDimensions = BlockDim; PointInPolygonKernel.GridDimensions = GridDim; PointInPolygonKernel.Run(GridCount, NumPoints, d_TriangleCounts.DevicePointer, d_Triangles.DevicePointer, d_Maxima.DevicePointer, d_Minima.DevicePointer, d_Points.DevicePointer); h_Points = d_Points; return(h_Points); // TODO Fix this to remove bad points }