/// <summary>
/// Disposes this Cuda buffer by releasing the underlying device memory.
/// </summary>
/// <param name="disposing">True, if the method was called via Dispose.</param>
protected override void DisposeAcceleratorObject(bool disposing)
{
    // Free the device allocation; VerifyDisposed only surfaces the error
    // status when called from an explicit Dispose.
    var freeStatus = CurrentAPI.FreeMemory(NativePtr);
    CudaException.VerifyDisposed(disposing, freeStatus);
    NativePtr = IntPtr.Zero;
}
/// <summary cref="MemoryBuffer{T, TIndex}.CopyFromView(
/// AcceleratorStream, ArrayView{T}, Index1)"/>
protected internal unsafe override void CopyFromView(
    AcceleratorStream stream,
    ArrayView<T> source,
    Index1 targetOffset)
{
    // The using scope guarantees that the previous accelerator binding is
    // recovered even when a copy fails (the original version skipped
    // binding.Recover() on every thrown exception).
    using var binding = Accelerator.BindScoped();

    var sourceAddress = new IntPtr(source.LoadEffectiveAddress());
    var targetAddress = new IntPtr(ComputeEffectiveAddress(targetOffset));
    var lengthInBytes = new IntPtr(source.LengthInBytes);

    switch (source.AcceleratorType)
    {
        case AcceleratorType.CPU:
            CudaException.ThrowIfFailed(
                CudaAPI.Current.MemcpyHostToDevice(
                    targetAddress,
                    sourceAddress,
                    lengthInBytes,
                    stream));
            break;
        case AcceleratorType.Cuda:
            CudaException.ThrowIfFailed(
                CudaAPI.Current.MemcpyDeviceToDevice(
                    targetAddress,
                    sourceAddress,
                    lengthInBytes,
                    stream));
            break;
        default:
            throw new NotSupportedException(
                RuntimeErrorMessages.NotSupportedTargetAccelerator);
    }
}
/// <inheritdoc/>
protected override void DisposeAcceleratorObject(bool disposing)
{
    // Destroy the native Cuda event; errors are only reported when the
    // object is being disposed explicitly.
    var destroyStatus = CurrentAPI.DestroyEvent(EventPtr);
    CudaException.VerifyDisposed(disposing, destroyStatus);
    EventPtr = IntPtr.Zero;
}
/// <summary>
/// Loads a compiled kernel into the given Cuda context as kernel program.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
/// <param name="kernel">The source kernel.</param>
/// <param name="launcher">The launcher method for the given kernel.</param>
internal CudaKernel(
    CudaAccelerator accelerator,
    PTXCompiledKernel kernel,
    MethodInfo launcher)
    : base(accelerator, kernel, launcher)
{
    var loadResult = CurrentAPI.LoadModule(
        out modulePtr,
        kernel.PTXAssembly,
        out string errorLog);

    // Dump the PTX error log to the trace before raising, since the
    // exception alone carries no module-specific diagnostics.
    if (loadResult != CudaError.CUDA_SUCCESS)
    {
        Trace.WriteLine("PTX Kernel loading failed:");
        Trace.WriteLine(
            string.IsNullOrWhiteSpace(errorLog)
            ? ">> No error information available"
            : errorLog);
    }
    CudaException.ThrowIfFailed(loadResult);

    // Resolve the kernel entry point within the loaded module.
    CudaException.ThrowIfFailed(
        CurrentAPI.GetModuleFunction(
            out functionPtr,
            modulePtr,
            kernel.Name));
}
/// <summary cref="MemoryBuffer{T, TIndex}.CopyToView(
/// AcceleratorStream, ArrayView{T}, LongIndex1)"/>
protected internal unsafe override void CopyToView(
    AcceleratorStream stream,
    ArrayView<T> target,
    LongIndex1 sourceOffset)
{
    // The using scope guarantees that the previous accelerator binding is
    // recovered even when the copy fails (the original version skipped
    // binding.Recover() on every thrown exception).
    using var binding = Accelerator.BindScoped();

    var targetBuffer = target.Source;
    var sourceAddress = new IntPtr(ComputeEffectiveAddress(sourceOffset));
    var targetAddress = new IntPtr(target.LoadEffectiveAddress());
    var lengthInBytes = new IntPtr(target.LengthInBytes);
    switch (targetBuffer.AcceleratorType)
    {
        case AcceleratorType.CPU:
        case AcceleratorType.Cuda:
            // Both CPU and Cuda targets are served by the same async copy.
            CudaException.ThrowIfFailed(
                CurrentAPI.MemcpyAsync(
                    targetAddress,
                    sourceAddress,
                    lengthInBytes,
                    stream));
            break;
        default:
            throw new NotSupportedException(
                RuntimeErrorMessages.NotSupportedTargetAccelerator);
    }
}
/// <summary cref="MemoryBuffer{T, TIndex}.CopyFromViewInternal(ArrayView{T, Index}, AcceleratorType, TIndex, AcceleratorStream)"/>
protected internal override void CopyFromViewInternal(
    ArrayView<T, Index> source,
    AcceleratorType acceleratorType,
    TIndex targetOffset,
    AcceleratorStream stream)
{
    // The copy direction is selected by the source accelerator type.
    var numBytes = new IntPtr(source.LengthInBytes);
    switch (acceleratorType)
    {
        case AcceleratorType.CPU:
            CudaException.ThrowIfFailed(
                CudaAPI.Current.MemcpyHostToDevice(
                    GetSubView(targetOffset).Pointer,
                    source.Pointer,
                    numBytes,
                    stream));
            break;
        case AcceleratorType.Cuda:
            CudaException.ThrowIfFailed(
                CudaAPI.Current.MemcpyDeviceToDevice(
                    GetSubView(targetOffset).Pointer,
                    source.Pointer,
                    numBytes,
                    stream));
            break;
        default:
            throw new NotSupportedException(
                RuntimeErrorMessages.NotSupportedTargetAccelerator);
    }
}
/// <inheritdoc/>
protected override ProfilingMarker AddProfilingMarkerInternal()
{
    // Bind the accelerator so the event is recorded on the correct Cuda
    // context; this matches the binding pattern used by the other
    // stream operations in this file.
    using var binding = Accelerator.BindScoped();
    var profilingMarker = new CudaProfilingMarker();
    CudaException.ThrowIfFailed(
        CurrentAPI.RecordEvent(profilingMarker.EventPtr, StreamPtr));
    return profilingMarker;
}
/// <summary cref="DisposeBase.Dispose(bool)"/>
protected override void Dispose(bool disposing)
{
    // Nothing to release if the handle was never created or already freed.
    if (streamPtr == IntPtr.Zero)
        return;

    // NOTE(review): this throws from Dispose when the driver call fails —
    // confirm callers expect that; the VerifyDisposed pattern used
    // elsewhere avoids throwing during finalization.
    CudaException.ThrowIfFailed(CudaAPI.Current.DestroyStream(streamPtr));
    streamPtr = IntPtr.Zero;
}
/// <summary>
/// Constructs a new cuda stream.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
internal CudaStream(Accelerator accelerator)
    : base(accelerator)
{
    // Create the native handle with the non-blocking flag.
    var createStatus = CudaAPI.Current.CreateStream(
        out streamPtr,
        StreamFlags.CU_STREAM_NON_BLOCKING);
    CudaException.ThrowIfFailed(createStatus);
}
/// <summary>
/// Disposes this Cuda kernel by unloading its module.
/// </summary>
/// <param name="disposing">True, if the method was called via Dispose.</param>
protected override void DisposeAcceleratorObject(bool disposing)
{
    // Destroying the module also invalidates the function handle,
    // so both pointers are cleared together.
    var destroyStatus = CurrentAPI.DestroyModule(modulePtr);
    CudaException.VerifyDisposed(disposing, destroyStatus);
    functionPtr = IntPtr.Zero;
    modulePtr = IntPtr.Zero;
}
/// <summary cref="AcceleratorStream.Synchronize"/>
public override void Synchronize()
{
    // Bind the accelerator for the duration of the call; the using
    // declaration restores the previous binding on exit.
    using var binding = Accelerator.BindScoped();
    CudaException.ThrowIfFailed(
        CudaAPI.Current.SynchronizeStream(streamPtr));
}
/// <summary cref="DisposeBase.Dispose(bool)"/>
protected override void Dispose(bool disposing)
{
    // Release the device allocation exactly once, then defer to the base
    // implementation regardless of whether a handle was present.
    if (NativePtr != IntPtr.Zero)
    {
        var freeStatus = CudaAPI.Current.FreeMemory(NativePtr);
        CudaException.ThrowIfFailed(freeStatus);
        NativePtr = IntPtr.Zero;
    }
    base.Dispose(disposing);
}
/// <summary cref="AcceleratorStream.Synchronize"/>
public override void Synchronize()
{
    // The using scope recovers the previous accelerator binding even if
    // synchronization fails (the original version skipped
    // binding.Recover() when ThrowIfFailed threw).
    using var binding = Accelerator.BindScoped();
    CudaException.ThrowIfFailed(
        CurrentAPI.SynchronizeStream(streamPtr));
}
/// <summary cref="DisposeBase.Dispose(bool)"/>
protected override void Dispose(bool disposing)
{
    // Nothing to unload if the module handle is already gone.
    if (modulePtr == IntPtr.Zero)
        return;

    // NOTE(review): throws from Dispose on driver failure — confirm this
    // is intended; the VerifyDisposed pattern used elsewhere avoids it.
    CudaException.ThrowIfFailed(CudaAPI.Current.DestroyModule(modulePtr));
    functionPtr = IntPtr.Zero;
    modulePtr = IntPtr.Zero;
}
/// <summary>
/// Loads the given compiled kernel as a Cuda module and resolves the
/// function handle of its entry point.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
/// <param name="kernel">The source kernel.</param>
/// <param name="launcher">The launcher method for the given kernel.</param>
internal CudaKernel(
    CudaAccelerator accelerator,
    CompiledKernel kernel,
    MethodInfo launcher)
    : base(accelerator, kernel, launcher)
{
    var loadStatus = CudaAPI.Current.LoadModule(
        out modulePtr,
        kernel.GetBuffer());
    CudaException.ThrowIfFailed(loadStatus);

    var resolveStatus = CudaAPI.Current.GetModuleFunction(
        out functionPtr,
        modulePtr,
        kernel.EntryName);
    CudaException.ThrowIfFailed(resolveStatus);
}
/// <inheritdoc/>
protected override ProfilingMarker AddProfilingMarkerInternal()
{
    // Bind the accelerator so the event is recorded on its Cuda context.
    using var binding = Accelerator.BindScoped();

    var marker = new CudaProfilingMarker(Accelerator);
    CudaException.ThrowIfFailed(
        CurrentAPI.RecordEvent(marker.EventPtr, StreamPtr));
    return marker;
}
/// <summary>
/// Constructs a new Cuda stream with given <see cref="StreamFlags"/>.
/// </summary>
/// <param name="accelerator">The associated accelerator.</param>
/// <param name="flag">
/// Stream flag to use. Allows blocking and non-blocking streams.
/// </param>
internal CudaStream(Accelerator accelerator, StreamFlags flag)
    : base(accelerator)
{
    var createStatus = CurrentAPI.CreateStream(out streamPtr, flag);
    CudaException.ThrowIfFailed(createStatus);

    // This instance created the handle, so it must also destroy it.
    responsibleForHandle = true;
}
/// <summary cref="DisposeBase.Dispose(bool)"/>
protected override void Dispose(bool disposing)
{
    // Only release the allocation once; subsequent calls are no-ops.
    if (Pointer != IntPtr.Zero)
    {
        // NOTE(review): throws from Dispose on driver failure — confirm
        // this is intended behavior for this version of the code.
        CudaException.ThrowIfFailed(CudaAPI.Current.FreeMemory(Pointer));
        Pointer = IntPtr.Zero;
    }
}
/// <summary cref="DisposeBase.Dispose(bool)"/>
protected override void Dispose(bool disposing)
{
    // Only destroy the native handle if this instance owns it;
    // externally supplied handles are left untouched.
    if (responsibleForHandle)
    {
        if (streamPtr != IntPtr.Zero)
        {
            CudaException.ThrowIfFailed(
                CurrentAPI.DestroyStream(streamPtr));
            streamPtr = IntPtr.Zero;
        }
    }
    base.Dispose(disposing);
}
/// <summary cref="MemoryBuffer.MemSetToZero(AcceleratorStream)"/>
public override void MemSetToZero(AcceleratorStream stream)
{
    // The using scope recovers the previous accelerator binding even if
    // Memset fails (the original version skipped binding.Recover()
    // when ThrowIfFailed threw).
    using var binding = Accelerator.BindScoped();
    CudaException.ThrowIfFailed(CudaAPI.Current.Memset(
        NativePtr,
        0,
        new IntPtr(LengthInBytes),
        stream));
}
/// <summary>
/// Constructs a new Cuda accelerator.
/// </summary>
/// <param name="context">The ILGPU context.</param>
/// <param name="deviceId">The target device id.</param>
/// <param name="acceleratorFlags">The accelerator flags.</param>
public CudaAccelerator(
    Context context,
    int deviceId,
    CudaAcceleratorFlags acceleratorFlags)
    : base(context, AcceleratorType.Cuda)
{
    // Create the driver context for the requested device first; all
    // subsequent setup queries require a valid context.
    var createStatus = CurrentAPI.CreateContext(
        out contextPtr,
        acceleratorFlags,
        deviceId);
    CudaException.ThrowIfFailed(createStatus);

    DeviceId = deviceId;
    SetupAccelerator();
}
/// <summary>
/// Disposes this Cuda stream.
/// </summary>
/// <param name="disposing">True, if the method was called via Dispose.</param>
protected override void DisposeAcceleratorObject(bool disposing)
{
    // Only destroy handles this instance owns and that are still alive.
    if (responsibleForHandle && streamPtr != IntPtr.Zero)
    {
        var destroyStatus = CurrentAPI.DestroyStream(streamPtr);
        CudaException.VerifyDisposed(disposing, destroyStatus);
        streamPtr = IntPtr.Zero;
    }
}
/// <inheritdoc/>
public override void Synchronize()
{
    var queryStatus = CurrentAPI.QueryEvent(EventPtr);
    // Not-ready means the event is still pending: block until it
    // completes. Any other status is surfaced directly.
    CudaException.ThrowIfFailed(
        queryStatus == CudaError.CUDA_ERROR_NOT_READY
        ? CurrentAPI.SynchronizeEvent(EventPtr)
        : queryStatus);
}
/// <inheritdoc/>
public override void Synchronize()
{
    // Bind the accelerator context for the duration of the query/wait.
    using var binding = Accelerator.BindScoped();

    var queryStatus = CurrentAPI.QueryEvent(EventPtr);
    // Not-ready means the event is still pending: block until it
    // completes. Any other status is surfaced directly.
    CudaException.ThrowIfFailed(
        queryStatus == CudaError.CUDA_ERROR_NOT_READY
        ? CurrentAPI.SynchronizeEvent(EventPtr)
        : queryStatus);
}
/// <summary>
/// Resolves the memory type of the given device pointer.
/// </summary>
/// <param name="value">The device pointer to check.</param>
/// <returns>The resolved memory type</returns>
public static unsafe CudaMemoryType GetCudaMemoryType(IntPtr value)
{
    int attributeValue = 0;
    var queryStatus = CurrentAPI.GetPointerAttribute(
        new IntPtr(Unsafe.AsPointer(ref attributeValue)),
        PointerAttribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
        value);
    // An invalid-value result maps to "no known memory type" instead of
    // raising an exception.
    if (queryStatus == CudaError.CUDA_ERROR_INVALID_VALUE)
        return CudaMemoryType.None;
    CudaException.ThrowIfFailed(queryStatus);
    return (CudaMemoryType)attributeValue;
}
/// <inheritdoc/>
protected internal override unsafe void MemSetInternal(
    AcceleratorStream stream,
    byte value,
    long offsetInBytes,
    long lengthInBytes)
{
    // The using scope recovers the previous accelerator binding even if
    // Memset fails (the original version skipped binding.Recover()
    // when ThrowIfFailed threw).
    using var binding = Accelerator.BindScoped();
    CudaException.ThrowIfFailed(
        CurrentAPI.Memset(
            new IntPtr(NativePtr.ToInt64() + offsetInBytes),
            value,
            new IntPtr(lengthInBytes),
            stream));
}
/// <summary>
/// Resolves the memory type of the given device pointer.
/// </summary>
/// <param name="value">The device pointer to check.</param>
/// <returns>The resolved memory type</returns>
public static unsafe CudaMemoryType GetCudaMemoryType(IntPtr value)
{
    // This functionality requires unified addresses (X64)
    Backends.Backend.EnsureRunningOnPlatform(TargetPlatform.X64);

    int attributeValue = 0;
    var queryStatus = CurrentAPI.GetPointerAttribute(
        new IntPtr(Unsafe.AsPointer(ref attributeValue)),
        PointerAttribute.CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
        value);
    // An invalid-value result maps to "no known memory type" instead of
    // raising an exception.
    if (queryStatus == CudaError.CUDA_ERROR_INVALID_VALUE)
        return CudaMemoryType.None;
    CudaException.ThrowIfFailed(queryStatus);
    return (CudaMemoryType)attributeValue;
}
/// <summary>
/// Setups all required settings: binds the new context, queries the device
/// name, memory size and device attributes, and initializes the PTX backend.
/// </summary>
private void SetupAccelerator()
{
    // Make this accelerator's context current before issuing driver queries.
    Bind();

    // Resolve device name and default stream
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceName(out string name, DeviceId));
    Name = name;
    DefaultStream = new CudaStream(this, IntPtr.Zero);

    // Resolve the total device memory size
    CudaException.ThrowIfFailed(
        CurrentAPI.GetTotalDeviceMemory(out long total, DeviceId));
    MemorySize = total;

    // Resolve max grid size
    MaxGridSize = new Index3(
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, DeviceId));

    // Resolve max group size
    MaxGroupSize = new Index3(
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, DeviceId));

    // Resolve max threads per group
    MaxNumThreadsPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, DeviceId);

    // Resolve max shared memory per block
    MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
        DeviceId);

    // Resolve total constant memory
    MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, DeviceId);

    // Resolve clock rate
    ClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE, DeviceId);

    // Resolve warp size
    WarpSize = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE, DeviceId);

    // Resolve number of multiprocessors
    NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, DeviceId);

    // Resolve max number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
        DeviceId);

    // Resolve cache configuration
    CudaException.ThrowIfFailed(
        CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
    CudaException.ThrowIfFailed(
        CurrentAPI.GetCacheConfig(out cacheConfiguration));

    // Setup architecture and backend
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceComputeCapability(
            out int major,
            out int minor,
            DeviceId));
    Architecture = PTXArchitectureUtils.GetArchitecture(major, minor);

    // The supported instruction set depends on both the architecture and
    // the installed driver version.
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDriverVersion(out var driverVersion));
    InstructionSet = GetInstructionSet(Architecture, driverVersion);
    Init(new PTXBackend(
        Context,
        Architecture,
        InstructionSet));
}
/// <summary cref="AcceleratorStream.Synchronize"/>
public override void Synchronize()
{
    // Block until all pending work on this stream has completed.
    var syncStatus = CudaAPI.Current.SynchronizeStream(streamPtr);
    CudaException.ThrowIfFailed(syncStatus);
}
/// <summary>
/// Setups all required settings: binds the new context, queries the device
/// name, memory properties and device attributes, resolves the PCI and
/// driver information, and initializes the PTX backend.
/// </summary>
private void SetupAccelerator()
{
    // Make this accelerator's context current before issuing driver queries.
    Bind();

    // Resolve device name and default stream
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceName(out string name, DeviceId));
    Name = name;
    // NOTE(review): the trailing 'false' presumably means this wrapper is
    // not responsible for destroying the default stream handle — confirm
    // against the CudaStream constructor.
    DefaultStream = new CudaStream(this, IntPtr.Zero, false);

    // Resolve the total device memory size
    CudaException.ThrowIfFailed(
        CurrentAPI.GetTotalDeviceMemory(out long total, DeviceId));
    MemorySize = total;

    // Resolve max grid size
    MaxGridSize = new Index3(
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, DeviceId));

    // Resolve max group size
    MaxGroupSize = new Index3(
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y, DeviceId),
        CurrentAPI.GetDeviceAttribute(
            DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z, DeviceId));

    // Resolve max threads per group
    MaxNumThreadsPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, DeviceId);

    // Resolve max shared memory per block
    MaxSharedMemoryPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
        DeviceId);

    // Resolve total constant memory
    MaxConstantMemory = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, DeviceId);

    // Resolve clock rate (the driver reports kHz; divided down here)
    ClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_CLOCK_RATE, DeviceId) / 1000;

    // Resolve memory clock rate (same unit conversion as above)
    MemoryClockRate = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, DeviceId) / 1000;

    // Resolve the bus width
    MemoryBusWidth = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH,
        DeviceId);

    // Resolve warp size
    WarpSize = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_WARP_SIZE, DeviceId);

    // Resolve number of multiprocessors
    NumMultiprocessors = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, DeviceId);

    // Resolve max number of threads per multiprocessor
    MaxNumThreadsPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
        DeviceId);

    // Resolve the L2 cache size
    L2CacheSize = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, DeviceId);

    // Resolve the maximum amount of shared memory per multiprocessor
    MaxSharedMemoryPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_MULTIPROCESSOR,
        DeviceId);

    // Resolve the total number of registers per multiprocessor
    TotalNumRegistersPerMultiprocessor = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_MULTIPROCESSOR,
        DeviceId);

    // Resolve the total number of registers per group
    TotalNumRegistersPerGroup = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK,
        DeviceId);

    // Resolve the max memory pitch
    MaxMemoryPitch = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MAX_PITCH, DeviceId);

    // Resolve the number of concurrent copy engines
    NumConcurrentCopyEngines = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, DeviceId);

    // Resolve whether this device has ECC support
    HasECCSupport = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_ECC_ENABLED, DeviceId) != 0;

    // Resolve whether this device supports managed memory
    SupportsManagedMemory = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY, DeviceId) != 0;

    // Resolve whether this device supports compute preemption
    SupportsComputePreemption = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
        DeviceId) != 0;

    // Resolve the current driver mode
    DriverMode = (DeviceDriverMode)CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_TCC_DRIVER, DeviceId);

    // Resolve the PCI domain id
    PCIDomainId = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, DeviceId);

    // Resolve the PCI bus id
    PCIBusId = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, DeviceId);

    // Resolve the PCI device id
    PCIDeviceId = CurrentAPI.GetDeviceAttribute(
        DeviceAttribute.CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, DeviceId);

    // Resolve cache configuration
    CudaException.ThrowIfFailed(
        CurrentAPI.GetSharedMemoryConfig(out sharedMemoryConfiguration));
    CudaException.ThrowIfFailed(
        CurrentAPI.GetCacheConfig(out cacheConfiguration));

    // Setup architecture and backend
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDeviceComputeCapability(
            out int major,
            out int minor,
            DeviceId));
    Architecture = PTXArchitectureUtils.GetArchitecture(major, minor);

    // The supported instruction set depends on both the architecture and
    // the installed driver version.
    CudaException.ThrowIfFailed(
        CurrentAPI.GetDriverVersion(out var driverVersion));
    DriverVersion = driverVersion;
    InstructionSet = GetInstructionSet(Architecture, driverVersion);
    base.Capabilities = new CudaCapabilityContext(Architecture);

    Init(new PTXBackend(
        Context,
        Capabilities,
        Architecture,
        InstructionSet));
}