Exemple #1
0
        public CudaError LaunchKernelWithStreamBinding(
            CudaStream stream,
            CudaKernel kernel,
            RuntimeKernelConfig config,
            IntPtr args,
            IntPtr kernelArgs)
        {
            var binding = stream.BindScoped();

            var result = LaunchKernel(
                kernel.FunctionPtr,
                config.GridDim.X,
                config.GridDim.Y,
                config.GridDim.Z,
                config.GroupDim.X,
                config.GroupDim.Y,
                config.GroupDim.Z,
                config.SharedMemoryConfig.DynamicArraySize,
                stream.StreamPtr,
                args,
                kernelArgs);

            binding.Recover();
            return(result);
        }
Exemple #2
0
        public CudaError MemcpyDeviceToDevice(
            IntPtr destinationDevice,
            IntPtr sourceDevice,
            IntPtr length,
            AcceleratorStream stream)
        {
            CudaStream cudaStream = stream as CudaStream;

            return(MemcpyDeviceToDevice(
                       destinationDevice,
                       sourceDevice,
                       length,
                       cudaStream?.StreamPtr ?? IntPtr.Zero));
        }
Exemple #3
0
        public CudaError LaunchKernelWithStruct <T>(
            CudaStream stream,
            CudaKernel kernel,
            RuntimeKernelConfig config,
            ref T args,
            int argsSizeInBytes)
            where T : unmanaged
        {
            // Setup object size
            var size = new IntPtr(argsSizeInBytes);

            Debug.Assert(
                argsSizeInBytes <= Interop.SizeOf <T>(),
                "Invalid argument size");

            // Pin object buffer in memory
            fixed(T *pArgs = &args)
            {
                // Setup unmanaged launch configuration for the driver
                var launchConfig = stackalloc void *[5];

                launchConfig[0] = (void *)1; // CU_LAUNCH_PARAM_BUFFER_POINTER
                launchConfig[1] = pArgs;
                launchConfig[2] = (void *)2; // CU_LAUNCH_PARAM_BUFFER_SIZE
                launchConfig[3] = &size;
                launchConfig[4] = (void *)0; // CU_LAUNCH_PARAM_END

                // Use existing launch configuration
                return(LaunchKernelWithStreamBinding(
                           stream,
                           kernel,
                           config,
                           IntPtr.Zero,
                           new IntPtr(launchConfig)));
            }
        }
Exemple #4
0
 /// <summary>
 /// Performs a Cuda memset operation.
 /// </summary>
 /// <typeparam name="T">The element type.</typeparam>
 /// <param name="stream">The Cuda stream to use (must not be null)</param>
 /// <param name="value">The value to write into the buffer.</param>
 /// <param name="targetView">The target view to write to.</param>
 public static void CudaMemSet <T>(
     CudaStream stream,
     byte value,
     in ArrayView <T> targetView)
Exemple #5
0
        /// <summary>
        /// Setups all required settings.
        /// </summary>
        private void SetupAccelerator()
        {
            Bind();

            CudaException.ThrowIfFailed(
                CurrentAPI.GetDeviceName(out string name, DeviceId));
            Name          = name;
            DefaultStream = new CudaStream(this, IntPtr.Zero);

            CudaException.ThrowIfFailed(
                CurrentAPI.GetTotalDeviceMemory(out long total, DeviceId));
            MemorySize = total;

            // Resolve max grid size
            MaxGridSize = new Index3(
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, DeviceId));

            // Resolve max group size
            MaxGroupSize = new Index3(
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, DeviceId));

            // Resolve max threads per group
            MaxNumThreadsPerGroup = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, DeviceId);

            // Resolve max shared memory per block
            MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                DeviceId);

            // Resolve total constant memory
            MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, DeviceId);

            // Resolve clock rate
            ClockRate = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE, DeviceId);

            // Resolve warp size
            WarpSize = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE, DeviceId);

            // Resolve number of multiprocessors
            NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, DeviceId);

            // Result max number of threads per multiprocessor
            MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                DeviceId);

            // Resolve cache configuration
            CudaException.ThrowIfFailed(
                CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
            CudaException.ThrowIfFailed(
                CurrentAPI.GetCacheConfig(out cacheConfiguration));

            // Setup architecture and backend
            CudaException.ThrowIfFailed(
                CurrentAPI.GetDeviceComputeCapability(
                    out int major,
                    out int minor,
                    DeviceId));
            Architecture = PTXArchitectureUtils.GetArchitecture(major, minor);

            CudaException.ThrowIfFailed(
                CurrentAPI.GetDriverVersion(out var driverVersion));
            InstructionSet = GetInstructionSet(Architecture, driverVersion);
            Init(new PTXBackend(
                     Context,
                     Architecture,
                     InstructionSet));
        }
Exemple #6
0
 /// <summary>
 /// Associates a CUDA stream with a cuFFT plan.
 /// </summary>
 public void SetStream(CudaStream cudaStream)
 {
     CuFFTException.ThrowIfFailed(
         API.SetStream(PlanHandle, cudaStream));
 }
        /// <summary>
        /// Setups all required settings.
        /// </summary>
        private void SetupAccelerator()
        {
            Bind();

            CudaException.ThrowIfFailed(
                CurrentAPI.GetDeviceName(out string name, DeviceId));
            Name          = name;
            DefaultStream = new CudaStream(this, IntPtr.Zero, false);

            CudaException.ThrowIfFailed(
                CurrentAPI.GetTotalDeviceMemory(out long total, DeviceId));
            MemorySize = total;

            // Resolve max grid size
            MaxGridSize = new Index3(
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, DeviceId));

            // Resolve max group size
            MaxGroupSize = new Index3(
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, DeviceId),
                CurrentAPI.GetDeviceAttribute(
                    DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, DeviceId));

            // Resolve max threads per group
            MaxNumThreadsPerGroup = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, DeviceId);

            // Resolve max shared memory per block
            MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                DeviceId);

            // Resolve total constant memory
            MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, DeviceId);

            // Resolve clock rate
            ClockRate = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE, DeviceId) / 1000;

            // Resolve memory clock rate
            MemoryClockRate = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, DeviceId) / 1000;

            // Resolve the bus width
            MemoryBusWidth = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, DeviceId);

            // Resolve warp size
            WarpSize = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE, DeviceId);

            // Resolve number of multiprocessors
            NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, DeviceId);

            // Result max number of threads per multiprocessor
            MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                DeviceId);

            // Resolve the L2 cache size
            L2CacheSize = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, DeviceId);

            // Resolve the maximum amount of shared memory per multiprocessor
            MaxSharedMemoryPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
                DeviceId);

            // Resolve the total number of registers per multiprocessor
            TotalNumRegistersPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
                DeviceId);

            // Resolve the total number of registers per group
            TotalNumRegistersPerGroup = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, DeviceId);

            // Resolve the max memory pitch
            MaxMemoryPitch = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_PITCH, DeviceId);

            // Resolve the number of concurrent copy engines
            NumConcurrentCopyEngines = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, DeviceId);

            // Resolve whether this device has ECC support
            HasECCSupport = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_ECC_ENABLED, DeviceId) != 0;

            // Resolve whether this device supports managed memory
            SupportsManagedMemory = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, DeviceId) != 0;

            // Resolve whether this device supports compute preemption
            SupportsComputePreemption = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
                DeviceId) != 0;

            // Resolve the current driver mode
            DriverMode = (DeviceDriverMode)CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_TCC_DRIVER,
                DeviceId);

            // Resolve the PCI domain id
            PCIDomainId = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID,
                DeviceId);

            // Resolve the PCI device id
            PCIBusId = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID,
                DeviceId);

            // Resolve the PCI device id
            PCIDeviceId = CurrentAPI.GetDeviceAttribute(
                DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID,
                DeviceId);

            // Resolve cache configuration
            CudaException.ThrowIfFailed(
                CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
            CudaException.ThrowIfFailed(
                CurrentAPI.GetCacheConfig(out cacheConfiguration));

            // Setup architecture and backend
            CudaException.ThrowIfFailed(
                CurrentAPI.GetDeviceComputeCapability(
                    out int major,
                    out int minor,
                    DeviceId));
            Architecture = PTXArchitectureUtils.GetArchitecture(major, minor);

            CudaException.ThrowIfFailed(
                CurrentAPI.GetDriverVersion(out var driverVersion));
            DriverVersion     = driverVersion;
            InstructionSet    = GetInstructionSet(Architecture, driverVersion);
            base.Capabilities = new CudaCapabilityContext(Architecture);

            Init(new PTXBackend(
                     Context,
                     Capabilities,
                     Architecture,
                     InstructionSet));
        }