/// <summary>
/// Launches the given kernel on the given stream while a scoped binding
/// obtained from <paramref name="stream"/> is active.
/// </summary>
/// <param name="stream">
/// The stream that provides the scoped binding and the native stream pointer.
/// </param>
/// <param name="kernel">The kernel whose function pointer is launched.</param>
/// <param name="config">
/// The launch configuration (grid dimensions, group dimensions and the
/// dynamic shared-memory size).
/// </param>
/// <param name="args">Forwarded unchanged to <c>LaunchKernel</c>.</param>
/// <param name="kernelArgs">Forwarded unchanged to <c>LaunchKernel</c>.</param>
/// <returns>The status code returned by <c>LaunchKernel</c>.</returns>
public CudaError LaunchKernelWithStreamBinding(
    CudaStream stream,
    CudaKernel kernel,
    RuntimeKernelConfig config,
    IntPtr args,
    IntPtr kernelArgs)
{
    var binding = stream.BindScoped();
    try
    {
        return LaunchKernel(
            kernel.FunctionPtr,
            config.GridDim.X,
            config.GridDim.Y,
            config.GridDim.Z,
            config.GroupDim.X,
            config.GroupDim.Y,
            config.GroupDim.Z,
            config.SharedMemoryConfig.DynamicArraySize,
            stream.StreamPtr,
            args,
            kernelArgs);
    }
    finally
    {
        // Always undo the scoped binding, even if evaluating an argument or
        // the launch itself throws; otherwise the binding would stay active.
        binding.Recover();
    }
}
/// <summary>
/// Performs a device-to-device copy using the native pointer of the given
/// accelerator stream.
/// </summary>
/// <param name="destinationDevice">The destination device address.</param>
/// <param name="sourceDevice">The source device address.</param>
/// <param name="length">The length argument forwarded to the copy.</param>
/// <param name="stream">
/// The accelerator stream. If it is not a <c>CudaStream</c> (or is null),
/// <see cref="IntPtr.Zero"/> is passed as the stream pointer instead.
/// </param>
/// <returns>The status code of the underlying copy call.</returns>
public CudaError MemcpyDeviceToDevice(
    IntPtr destinationDevice,
    IntPtr sourceDevice,
    IntPtr length,
    AcceleratorStream stream)
{
    // Fall back to the zero stream pointer for non-Cuda (or missing) streams.
    var nativeStream = stream is CudaStream cudaStream
        ? cudaStream.StreamPtr
        : IntPtr.Zero;

    return MemcpyDeviceToDevice(
        destinationDevice,
        sourceDevice,
        length,
        nativeStream);
}
/// <summary>
/// Launches a kernel whose arguments are passed as a single unmanaged
/// structure via a packed launch-parameter buffer.
/// </summary>
/// <typeparam name="T">The unmanaged argument structure type.</typeparam>
/// <param name="stream">The stream to launch on.</param>
/// <param name="kernel">The kernel to launch.</param>
/// <param name="config">The launch configuration.</param>
/// <param name="args">The argument structure (pinned during the call).</param>
/// <param name="argsSizeInBytes">
/// The number of bytes of <paramref name="args"/> to pass; asserted (debug
/// builds only) to be at most <c>Interop.SizeOf&lt;T&gt;()</c>.
/// </param>
/// <returns>
/// The status code returned by <c>LaunchKernelWithStreamBinding</c>.
/// </returns>
public CudaError LaunchKernelWithStruct<T>(
    CudaStream stream,
    CudaKernel kernel,
    RuntimeKernelConfig config,
    ref T args,
    int argsSizeInBytes)
    where T : unmanaged
{
    // Tags of the driver's "extra" launch-parameter list.
    const int LaunchParamBufferPointer = 1; // CU_LAUNCH_PARAM_BUFFER_POINTER
    const int LaunchParamBufferSize = 2;    // CU_LAUNCH_PARAM_BUFFER_SIZE
    const int LaunchParamEnd = 0;           // CU_LAUNCH_PARAM_END

    Debug.Assert(
        argsSizeInBytes <= Interop.SizeOf<T>(),
        "Invalid argument size");

    // The size is handed to the driver by address, so it needs its own local.
    var bufferSize = new IntPtr(argsSizeInBytes);

    // Pin the argument structure for the duration of the launch call.
    fixed (T* pinnedArgs = &args)
    {
        // Build the tag/value list: (pointer tag, buffer), (size tag, &size), end.
        void** launchParams = stackalloc void*[5];
        launchParams[0] = (void*)LaunchParamBufferPointer;
        launchParams[1] = pinnedArgs;
        launchParams[2] = (void*)LaunchParamBufferSize;
        launchParams[3] = &bufferSize;
        launchParams[4] = (void*)LaunchParamEnd;

        // Reuse the stream-bound launch path; no separate argument array is used.
        return LaunchKernelWithStreamBinding(
            stream,
            kernel,
            config,
            IntPtr.Zero,
            new IntPtr(launchParams));
    }
}
/// <summary> /// Performs a Cuda memset operation. /// </summary> /// <typeparam name="T">The element type.</typeparam> /// <param name="stream">The Cuda stream to use (must not be null)</param> /// <param name="value">The value to write into the buffer.</param> /// <param name="targetView">The target view to write to.</param> public static void CudaMemSet <T>( CudaStream stream, byte value, in ArrayView <T> targetView)
/// <summary>
/// Sets up all required settings: binds the accelerator, queries the device
/// properties from the current API and initializes the PTX backend.
/// </summary>
private void SetupAccelerator()
{
    Bind();

    var device = DeviceId;

    // Device name
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceName(out string deviceName, device));
    Name = deviceName;

    // Default stream wraps the zero stream pointer
    DefaultStream = new CudaStream(this, IntPtr.Zero);

    // Total device memory
    CudaException.ThrowIfFailed(
        CurrentAPI.GetTotalDeviceMemory(out long totalMemory, device));
    MemorySize = totalMemory;

    // Maximum grid dimensions
    var maxGridX = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
    var maxGridY = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
    var maxGridZ = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
    MaxGridSize = new Index3(maxGridX, maxGridY, maxGridZ);

    // Maximum group (block) dimensions
    var maxBlockX = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
    var maxBlockY = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device);
    var maxBlockZ = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device);
    MaxGroupSize = new Index3(maxBlockX, maxBlockY, maxBlockZ);

    // Maximum number of threads per group
    MaxNumThreadsPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);

    // Maximum shared memory per block
    MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device);

    // Total constant memory
    MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, device);

    // Clock rate (stored exactly as reported by the driver)
    ClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);

    // Warp size
    WarpSize = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);

    // Number of multiprocessors
    NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);

    // Maximum number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
        device);

    // Shared-memory and cache configuration
    CudaException.ThrowIfFailed(
        CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
    CudaException.ThrowIfFailed(
        CurrentAPI.GetCacheConfig(out cacheConfiguration));

    // Architecture from the compute capability
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceComputeCapability(
            out int capabilityMajor,
            out int capabilityMinor,
            device));
    Architecture = PTXArchitectureUtils.GetArchitecture(
        capabilityMajor,
        capabilityMinor);

    // Instruction set from architecture and driver version
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDriverVersion(out var installedDriverVersion));
    InstructionSet = GetInstructionSet(Architecture, installedDriverVersion);

    // Backend initialization
    Init(new PTXBackend(
        Context,
        Architecture,
        InstructionSet));
}
/// <summary>
/// Associates a CUDA stream with this cuFFT plan.
/// </summary>
/// <param name="cudaStream">The CUDA stream to associate with the plan.</param>
public void SetStream(CudaStream cudaStream) =>
    CuFFTException.ThrowIfFailed(
        API.SetStream(PlanHandle, cudaStream));
/// <summary>
/// Sets up all required settings: binds the accelerator, queries the device
/// properties from the current API, creates the capability context and
/// initializes the PTX backend.
/// </summary>
private void SetupAccelerator()
{
    Bind();

    var device = DeviceId;

    // Device name
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceName(out string deviceName, device));
    Name = deviceName;

    // Default stream wraps the zero stream pointer
    DefaultStream = new CudaStream(this, IntPtr.Zero, false);

    // Total device memory
    CudaException.ThrowIfFailed(
        CurrentAPI.GetTotalDeviceMemory(out long totalMemory, device));
    MemorySize = totalMemory;

    // Maximum grid dimensions
    var maxGridX = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, device);
    var maxGridY = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, device);
    var maxGridZ = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, device);
    MaxGridSize = new Index3(maxGridX, maxGridY, maxGridZ);

    // Maximum group (block) dimensions
    var maxBlockX = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, device);
    var maxBlockY = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, device);
    var maxBlockZ = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, device);
    MaxGroupSize = new Index3(maxBlockX, maxBlockY, maxBlockZ);

    // Maximum number of threads per group
    MaxNumThreadsPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, device);

    // Maximum shared memory per block
    MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, device);

    // Total constant memory
    MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, device);

    // Clock rate: the reported value is divided by 1000 (integer division).
    // NOTE(review): the CUDA driver documents this attribute in kHz, which
    // would make the stored value MHz - confirm against the property docs.
    var reportedClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE, device);
    ClockRate = reportedClockRate / 1000;

    // Memory clock rate: scaled the same way as the core clock rate
    var reportedMemoryClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, device);
    MemoryClockRate = reportedMemoryClockRate / 1000;

    // Global memory bus width
    MemoryBusWidth = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, device);

    // Warp size
    WarpSize = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE, device);

    // Number of multiprocessors
    NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, device);

    // Maximum number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
        device);

    // L2 cache size
    L2CacheSize = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, device);

    // Maximum amount of shared memory per multiprocessor
    MaxSharedMemoryPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        device);

    // Total number of registers per multiprocessor
    TotalNumRegistersPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
        device);

    // Total number of registers per group
    TotalNumRegistersPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, device);

    // Maximum memory pitch
    MaxMemoryPitch = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_PITCH, device);

    // Number of concurrent copy engines
    NumConcurrentCopyEngines = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, device);

    // Boolean capabilities: any non-zero attribute value counts as "true"
    var eccEnabled = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_ECC_ENABLED, device);
    HasECCSupport = eccEnabled != 0;

    var managedMemory = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, device);
    SupportsManagedMemory = managedMemory != 0;

    var computePreemption = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
        device);
    SupportsComputePreemption = computePreemption != 0;

    // Current driver mode (the raw TCC attribute value cast to the enum)
    DriverMode = (DeviceDriverMode)CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_TCC_DRIVER, device);

    // PCI domain id
    PCIDomainId = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device);

    // PCI bus id
    PCIBusId = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device);

    // PCI device id
    PCIDeviceId = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device);

    // Shared-memory and cache configuration
    CudaException.ThrowIfFailed(
        CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
    CudaException.ThrowIfFailed(
        CurrentAPI.GetCacheConfig(out cacheConfiguration));

    // Architecture from the compute capability
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceComputeCapability(
            out int capabilityMajor,
            out int capabilityMinor,
            device));
    Architecture = PTXArchitectureUtils.GetArchitecture(
        capabilityMajor,
        capabilityMinor);

    // Driver version and the instruction set derived from it
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDriverVersion(out var installedDriverVersion));
    DriverVersion = installedDriverVersion;
    InstructionSet = GetInstructionSet(Architecture, installedDriverVersion);

    // Capability context and backend initialization
    base.Capabilities = new CudaCapabilityContext(Architecture);
    Init(new PTXBackend(
        Context,
        Capabilities,
        Architecture,
        InstructionSet));
}