/// <summary>
/// Prepares all <see cref="CPURuntimeGroupContext"/> and
/// <see cref="CPURuntimeWarpContext"/> instances for the given task.
/// </summary>
/// <param name="task">The current CPU task.</param>
/// <exception cref="NotSupportedException">
/// Thrown if the group size, rounded up to a whole number of warps, exceeds
/// <c>MaxNumThreadsPerGroup</c>.
/// </exception>
private void SetupRuntimeClasses(CPUAcceleratorTask task)
{
    // Determine how many warps are required to host a single group
    int threadsPerGroup = task.GroupDim.Size;
    int warpCount = IntrinsicMath.DivRoundUp(threadsPerGroup, WarpSize);
    int paddedGroupSize = warpCount * WarpSize;
    if (paddedGroupSize > MaxNumThreadsPerGroup)
    {
        var message = string.Format(
            RuntimeErrorMessages.NotSupportedTotalGroupSize,
            MaxNumThreadsPerGroup);
        throw new NotSupportedException(message);
    }

    int processorCount = NumMultiprocessors;
    int lastWarpIdx = warpCount - 1;
    for (int processorIdx = 0; processorIdx < processorCount; ++processorIdx)
    {
        // One group context per multiprocessor
        groupContexts[processorIdx].Initialize(
            task.GridDim,
            task.GroupDim,
            task.DynamicSharedMemoryConfig);

        // All warps except the last one are always fully populated
        int warpIdx = 0;
        while (warpIdx < lastWarpIdx)
        {
            warpContexts[processorIdx, warpIdx].Initialize(WarpSize);
            ++warpIdx;
        }

        // The last warp hosts the remaining threads; it is a full warp if the
        // group size is an exact multiple of the warp size
        int remainder = threadsPerGroup % WarpSize;
        int lastWarpSize = remainder == 0 ? WarpSize : remainder;
        warpContexts[processorIdx, lastWarpIdx].Initialize(lastWarpSize);
    }
}
/// <summary>
/// Stops all processing threads and releases the owned runtime resources.
/// </summary>
/// <remarks>
/// The managed teardown is executed at most once: after it has completed,
/// <c>threads</c> is <c>null</c> and subsequent calls skip directly to
/// <c>base.Dispose</c>. This makes repeated calls safe instead of failing with a
/// <see cref="NullReferenceException"/> while enumerating the cleared fields.
/// </remarks>
/// <param name="disposing">
/// True, if the method is called from an explicit dispose call and managed
/// resources should be released.
/// </param>
protected override void Dispose(bool disposing)
{
    // threads is set to null at the end of the first teardown; use it as the
    // "already disposed" marker so that a second call is a no-op.
    if (disposing && threads != null)
    {
        // Clear the running flag and wake up every thread that is waiting on
        // the synchronization object so that it can observe the change.
        lock (taskSynchronizationObject)
        {
            running = false;
            currentTask = null;
            Monitor.PulseAll(taskSynchronizationObject);
        }

        // Wait for all threads to terminate before releasing anything they
        // might still be using.
        foreach (var thread in threads)
        {
            thread.Join();
        }
        threads = null;

        foreach (var group in groupContexts)
        {
            group.Dispose();
        }
        groupContexts = null;

        finishedEvent.Dispose();
    }
    base.Dispose(disposing);
}
/// <summary>
/// Worker loop that is executed by every processing thread.
/// </summary>
/// <remarks>
/// The thread blocks until a task that differs from the previously processed one
/// is published via <c>currentTask</c>, processes its share of the task (if the
/// thread is active for the task's group size) and then signals
/// <c>finishedEvent</c>. The loop ends once <c>running</c> is cleared.
/// </remarks>
/// <param name="arg">The relative thread index (a boxed <see cref="int"/>).</param>
private void ExecuteThread(object arg)
{
    int threadIndex = (int)arg;
    CPUAcceleratorTask activeTask = null;

    while (true)
    {
        lock (taskSynchronizationObject)
        {
            // Sleep while we are still running and there is either no task at all
            // or only the task that this thread has already processed
            while (running & (currentTask == null | currentTask == activeTask))
            {
                Monitor.Wait(taskSynchronizationObject);
            }

            if (!running)
            {
                break;
            }

            activeTask = currentTask;
        }
        Debug.Assert(activeTask != null, "Invalid task");

        // Map this thread to a runtime group and to an index within this group
        var threadsPerGroup = ComputeNumGroupThreads(activeTask.GroupDim.Size);
        var indexInGroup = threadIndex % threadsPerGroup;
        var groupIndex = threadIndex / threadsPerGroup;
        var groupCount = NumThreads / threadsPerGroup;
        var activeThreadCount = groupCount * threadsPerGroup;
        Debug.Assert(activeThreadCount > 0, "Invalid group size");

        // Threads that do not fit into a complete runtime group stay idle
        if (threadIndex < activeThreadCount)
        {
            // Bind the context to the current thread
            var context = groupContexts[groupIndex];
            context.MakeCurrent();

            // Distribute the runtime dimension evenly across all runtime groups
            // and round each chunk up to a multiple of the group thread size
            var runtimeDimension = activeTask.RuntimeDimension;
            var chunkSize = (runtimeDimension + groupCount - 1) / groupCount;
            chunkSize =
                ((chunkSize + threadsPerGroup - 1) / threadsPerGroup) *
                threadsPerGroup;
            var chunkOffset = chunkSize * groupIndex;
            var targetDimension = Math.Min(
                activeTask.UserDimension,
                runtimeDimension);

            // Setup current indices
            CPURuntimeThreadContext.SetupDimensions(
                activeTask.GridDim,
                activeTask.GroupDim);

            // Prepare execution
            context.WaitForNextThreadIndex();

            Debug.Assert(
                context.SharedMemory.LengthInBytes == activeTask.SharedMemSize,
                "Invalid shared-memory initialization");
            activeTask.Execute(
                context,
                indexInGroup,
                threadsPerGroup,
                chunkSize,
                chunkOffset,
                targetDimension);
        }

        // Active and idle threads both report completion of the current task
        finishedEvent.SignalAndWait();
    }
}
/// <summary>
/// Entry point for a single processing thread.
/// </summary>
/// <remarks>
/// The absolute thread index is split into a multiprocessor index and a thread
/// index within this multiprocessor. The thread, warp and group contexts are
/// bound once, as they do not change between tasks. Afterwards, the thread
/// repeatedly waits for a new task, processes its chunk of the grid and
/// synchronizes with the other threads using the barrier stored in
/// <c>processorBarriers[processorIdx]</c>. The loop ends once <c>running</c>
/// is cleared.
/// </remarks>
/// <param name="arg">The absolute thread index (a boxed <see cref="int"/>).</param>
private void ExecuteThread(object arg)
{
    // Get the current thread information
    int absoluteThreadIndex = (int)arg;
    int threadIdx = absoluteThreadIndex % MaxNumThreadsPerMultiprocessor;
    var processorIdx = absoluteThreadIndex / MaxNumThreadsPerMultiprocessor;
    var processorBarrier = processorBarriers[processorIdx];
    bool isMainThread = threadIdx == 0;

    // Setup a new thread context for this thread and initialize the lane index
    int laneIdx = threadIdx % WarpSize;
    var threadContext = new CPURuntimeThreadContext(laneIdx)
    {
        LinearGroupIndex = threadIdx
    };
    threadContext.MakeCurrent();

    // Setup the current warp context as it always stays the same
    int warpIdx = threadIdx / WarpSize;
    var warpContext = warpContexts[processorIdx, warpIdx];
    warpContext.MakeCurrent();

    // Setup the current group context as it always stays the same
    var groupContext = groupContexts[processorIdx];
    groupContext.MakeCurrent();

    CPUAcceleratorTask task = null;
    for (; ; )
    {
        // Get a new task to execute: wait while we are still running and there
        // is either no task or only the task this thread has already processed
        lock (taskSynchronizationObject)
        {
            while ((currentTask == null | currentTask == task) & running)
            {
                Monitor.Wait(taskSynchronizationObject);
            }
            if (!running)
            {
                break;
            }
            task = currentTask;
        }
        Debug.Assert(task != null, "Invalid task");

        // Setup the current group index
        threadContext.GroupIndex = Index3D.ReconstructIndex(
            threadIdx,
            task.GroupDim);

        // Rendezvous on this multiprocessor's barrier before processing starts
        Thread.MemoryBarrier();
        processorBarrier.SignalAndWait();
        try
        {
            // If we are an active group thread
            int groupSize = task.GroupDim.Size;
            if (threadIdx < groupSize)
            {
                try
                {
                    var launcher = task.KernelExecutionDelegate;

                    // Split the grid into different chunks that will be processed
                    // by the available multiprocessors
                    int linearGridDim = task.GridDim.Size;
                    int gridChunkSize = IntrinsicMath.DivRoundUp(
                        linearGridDim,
                        NumMultiprocessors);
                    int gridOffset = gridChunkSize * processorIdx;
                    int linearUserDim = task.TotalUserDim.Size;
                    for (
                        int i = gridOffset, e = gridOffset + gridChunkSize;
                        i < e;
                        ++i)
                    {
                        // Begin/End are issued for every chunk entry, even if the
                        // kernel invocation below is skipped for this thread
                        groupContext.BeginThreadProcessing();
                        try
                        {
                            // Setup the current grid index
                            threadContext.GridIndex = Index3D.ReconstructIndex(
                                i,
                                task.GridDim);

                            // Invoke the actual kernel launcher for all global
                            // indices that lie within the user-defined dimension
                            int globalIndex = i * groupSize + threadIdx;
                            if (globalIndex < linearUserDim)
                            {
                                launcher(task, globalIndex);
                            }
                        }
                        finally
                        {
                            groupContext.EndThreadProcessing();
                        }
                    }
                }
                finally
                {
                    // This thread has already finished processing
                    groupContext.FinishThreadProcessing();
                    warpContext.FinishThreadProcessing();
                }
            }
        }
        finally
        {
            // Rendezvous on this multiprocessor's barrier once more; this also
            // happens if the processing above has thrown an exception
            processorBarrier.SignalAndWait();

            // If we reach this point and we are the main thread, notify the
            // parent accelerator instance
            if (isMainThread)
            {
                finishedEventPerMultiprocessor.SignalAndWait();
            }
        }
    }
}
/// <summary>
/// Worker loop that is executed by every processing thread.
/// </summary>
/// <remarks>
/// The warp context of the thread is bound once up front. Afterwards, the thread
/// blocks until a task that differs from the previously processed one is
/// published via <c>currentTask</c>, processes its share of the task (if the
/// thread is active for the task's group size) and then signals
/// <c>finishedEvent</c>. The loop ends once <c>running</c> is cleared.
/// </remarks>
/// <param name="arg">The relative thread index (a boxed <see cref="int"/>).</param>
private void ExecuteThread(object arg)
{
    int threadIndex = (int)arg;

    // The warp assignment depends on the thread index only and never changes
    var warp = warpContexts[threadIndex / WarpSize];
    Debug.Assert(warp != null, "Invalid warp context");
    warp.MakeCurrent();

    CPUAcceleratorTask activeTask = null;
    while (true)
    {
        lock (taskSynchronizationObject)
        {
            // Sleep while we are still running and there is either no task at all
            // or only the task that this thread has already processed
            while (running & (currentTask == null | currentTask == activeTask))
            {
                Monitor.Wait(taskSynchronizationObject);
            }

            if (!running)
            {
                break;
            }

            activeTask = currentTask;
        }
        Debug.Assert(activeTask != null, "Invalid task");

        // Map this thread to a runtime group and to an index within this group
        var threadsPerGroup = ComputeNumGroupThreads(activeTask.GroupDim.Size);
        var indexInGroup = threadIndex % threadsPerGroup;
        warp.Initialize(indexInGroup, out int threadOffset);

        var groupIndex = threadIndex / threadsPerGroup;
        var groupCount = NumThreads / threadsPerGroup;
        var activeThreadCount = groupCount * threadsPerGroup;
        Debug.Assert(activeThreadCount > 0, "Invalid group size");

        // Threads that do not fit into a complete runtime group stay idle
        if (threadIndex < activeThreadCount)
        {
            // Bind the context to the current thread
            groupContexts[groupIndex].MakeCurrent(
                out ArrayView<byte> sharedMemory,
                out Barrier groupBarrier);

            // Distribute the runtime dimension evenly across all runtime groups
            // and round each chunk up to a multiple of the group thread size
            var runtimeDimension = activeTask.RuntimeDimension;
            var chunkSize = (runtimeDimension + groupCount - 1) / groupCount;
            chunkSize =
                ((chunkSize + threadsPerGroup - 1) / threadsPerGroup) *
                threadsPerGroup;
            var chunkOffset = chunkSize * groupIndex;
            var targetDimension = Math.Min(
                activeTask.UserDimension,
                runtimeDimension);

            Debug.Assert(
                sharedMemory.LengthInBytes == activeTask.SharedMemSize,
                "Invalid shared-memory initialization");
            activeTask.Execute(
                groupBarrier,
                sharedMemory,
                threadOffset,
                threadsPerGroup,
                groupCount,
                activeThreadCount,
                chunkSize,
                chunkOffset,
                targetDimension);
        }

        // Active and idle threads both report completion of the current task
        finishedEvent.SignalAndWait();
    }
}
/// <summary>
/// Begins an accelerator task.
/// </summary>
/// <remarks>
/// This member is abstract and has to be implemented by derived classes.
/// </remarks>
/// <param name="task">The task to launch.</param>
protected abstract void BeginLaunch(CPUAcceleratorTask task);
/// <summary>
/// Entry point for a single processing thread.
/// </summary>
/// <remarks>
/// The thread index within this multiprocessor is derived from the absolute
/// thread index. The thread, warp and group contexts are bound once, as they do
/// not change between tasks. Afterwards, the thread repeatedly requests a task
/// via <c>Accelerator.WaitForTask</c>, processes its chunk of the grid and
/// synchronizes with the other threads using <c>processorBarrier</c>. The loop
/// ends as soon as <c>Accelerator.WaitForTask</c> returns false.
/// </remarks>
/// <param name="arg">The absolute thread index (a boxed <see cref="int"/>).</param>
private void ExecuteThread(object arg)
{
    // Get the current thread information
    int absoluteThreadIndex = (int)arg;
    int threadIdx = absoluteThreadIndex % MaxNumThreadsPerMultiprocessor;
    bool isMainThread = threadIdx == 0;

    // Setup a new thread context for this thread and initialize the lane index
    int laneIdx = threadIdx % WarpSize;
    int warpIdx = threadIdx / WarpSize;
    var threadContext = new CPURuntimeThreadContext(laneIdx, warpIdx)
    {
        LinearGroupIndex = threadIdx
    };
    threadContext.MakeCurrent();

    // Setup the current warp context as it always stays the same
    var warpContext = warpContexts[warpIdx];
    warpContext.MakeCurrent();

    // Setup the current group context as it always stays the same
    groupContext.MakeCurrent();

    CPUAcceleratorTask task = null;
    for (; ; )
    {
        // Get a new task to execute (if any)
        if (!Accelerator.WaitForTask(ref task))
        {
            break;
        }

        // Setup the current group index
        threadContext.GroupIndex = Stride3D.DenseXY.ReconstructFromElementIndex(
            threadIdx,
            task.GroupDim);

        // Rendezvous on the processor barrier before processing starts
        Thread.MemoryBarrier();
        processorBarrier.SignalAndWait();
        try
        {
            // If we are an active group thread
            int groupSize = task.GroupDim.Size;
            if (threadIdx < groupSize)
            {
                try
                {
                    var launcher = task.KernelExecutionDelegate;

                    // Split the grid into different chunks that will be processed
                    // by the available multiprocessors
                    int linearGridDim = task.GridDim.Size;
                    int gridChunkSize = IntrinsicMath.DivRoundUp(
                        linearGridDim,
                        Accelerator.NumMultiprocessors);
                    int gridOffset = gridChunkSize * ProcessorIndex;
                    int linearUserDim = task.TotalUserDim.Size;
                    for (
                        int i = gridOffset, e = gridOffset + gridChunkSize;
                        i < e;
                        ++i)
                    {
                        // Begin/End are issued for every chunk entry, even if the
                        // kernel invocation below is skipped for this thread
                        BeginThreadProcessing();
                        try
                        {
                            // Setup the current grid index
                            threadContext.GridIndex = Stride3D.DenseXY
                                .ReconstructFromElementIndex(
                                    i,
                                    task.GridDim);

                            // Invoke the actual kernel launcher for all global
                            // indices that lie within the user-defined dimension
                            int globalIndex = i * groupSize + threadIdx;
                            if (globalIndex < linearUserDim)
                            {
                                launcher(task, globalIndex);
                            }
                        }
                        finally
                        {
                            EndThreadProcessing();
                        }
                    }
                }
                finally
                {
                    // This thread has already finished processing
                    FinishThreadProcessing();
                }
            }
        }
        finally
        {
            // Rendezvous on the processor barrier once more; this also happens
            // if the processing above has thrown an exception
            processorBarrier.SignalAndWait();

            // If we reach this point and we are the main thread, notify the
            // parent accelerator instance
            if (isMainThread)
            {
                Accelerator.FinishTaskProcessing();
            }
        }
    }
}