/// <summary>
/// Thread body of a single processing thread. Repeatedly waits for a new task,
/// runs this thread's part of it and then signals the finished event.
/// </summary>
/// <param name="arg">The relative thread index (a boxed integer).</param>
private void ExecuteThread(object arg)
{
    var relativeThreadIdx = (int)arg;
    CPUAcceleratorTask task = null;

    while (true)
    {
        lock (taskSynchronizationObject)
        {
            // Block while there is no task (or only the task handled in the
            // previous iteration) and the runtime is still marked as running.
            bool nothingNew = currentTask == null | currentTask == task;
            while (nothingNew & running)
            {
                Monitor.Wait(taskSynchronizationObject);
                nothingNew = currentTask == null | currentTask == task;
            }

            // Leave the worker loop once running has been cleared
            if (!running)
            {
                break;
            }

            task = currentTask;
        }
        Debug.Assert(task != null, "Invalid task");

        // Derive this thread's position in the runtime group layout
        var threadsPerGroup = ComputeNumGroupThreads(task.GroupDim.Size);
        var threadIdxInGroup = relativeThreadIdx % threadsPerGroup;
        var groupIdx = relativeThreadIdx / threadsPerGroup;
        var groupCount = NumThreads / threadsPerGroup;
        var activeThreadCount = groupCount * threadsPerGroup;
        Debug.Assert(activeThreadCount > 0, "Invalid group size");

        // Threads that do not fit into a complete group skip the work but
        // still take part in the synchronization at the end of the iteration.
        if (relativeThreadIdx < activeThreadCount)
        {
            // Attach the context of our group to this thread
            var groupContext = groupContexts[groupIdx];
            groupContext.MakeCurrent();

            // Divide the runtime dimension evenly among the groups and round
            // the result up to a multiple of the group thread size.
            var runtimeDimension = task.RuntimeDimension;
            var chunkSize = (runtimeDimension + groupCount - 1) / groupCount;
            chunkSize =
                ((chunkSize + threadsPerGroup - 1) / threadsPerGroup) * threadsPerGroup;
            var chunkOffset = chunkSize * groupIdx;

            // Publish the grid and group dimensions of the task
            CPURuntimeThreadContext.SetupDimensions(task.GridDim, task.GroupDim);

            // Get ready and run our share of the task
            groupContext.WaitForNextThreadIndex();
            var targetDimension = Math.Min(task.UserDimension, runtimeDimension);
            task.Execute(
                groupContext,
                threadIdxInGroup,
                threadsPerGroup,
                chunkSize,
                chunkOffset,
                targetDimension);
        }

        finishedEvent.SignalAndWait();
    }
}
/// <summary>
/// Entry point for a single processing thread.
/// </summary>
/// <remarks>
/// The thread binds its thread, warp and group contexts once and then loops:
/// it waits for a new task, synchronizes on the barrier of its multiprocessor,
/// processes its chunk of the grid and synchronizes again. The loop ends when
/// <c>running</c> is cleared.
/// </remarks>
/// <param name="arg">The absolute thread index.</param>
private void ExecuteThread(object arg)
{
    // Get the current thread information
    int absoluteThreadIndex = (int)arg;
    int threadIdx = absoluteThreadIndex % MaxNumThreadsPerMultiprocessor;
    var processorIdx = absoluteThreadIndex / MaxNumThreadsPerMultiprocessor;
    var processorBarrier = processorBarriers[processorIdx];
    bool isMainThread = threadIdx == 0;

    // Setup a new thread context for this thread and initialize the lane index
    int laneIdx = threadIdx % WarpSize;
    var threadContext = new CPURuntimeThreadContext(laneIdx)
    {
        LinearGroupIndex = threadIdx
    };
    threadContext.MakeCurrent();

    // Setup the current warp context as it always stays the same
    int warpIdx = threadIdx / WarpSize;
    var warpContext = warpContexts[processorIdx, warpIdx];
    warpContext.MakeCurrent();

    // Setup the current group context as it always stays the same
    var groupContext = groupContexts[processorIdx];
    groupContext.MakeCurrent();

    CPUAcceleratorTask task = null;
    for (; ; )
    {
        // Get a new task to execute
        lock (taskSynchronizationObject)
        {
            // Wait while there is no task, or only the task that has been
            // processed in the previous iteration, and we are still running
            while ((currentTask == null | currentTask == task) & running)
            {
                Monitor.Wait(taskSynchronizationObject);
            }
            if (!running)
            {
                break;
            }
            task = currentTask;
        }
        Debug.Assert(task != null, "Invalid task");

        // Setup the current group index
        threadContext.GroupIndex = Index3D.ReconstructIndex(
            threadIdx,
            task.GroupDim);

        // Wait for the threads sharing this processor barrier to arrive here
        Thread.MemoryBarrier();
        processorBarrier.SignalAndWait();
        try
        {
            // If we are an active group thread
            int groupSize = task.GroupDim.Size;
            if (threadIdx < groupSize)
            {
                try
                {
                    var launcher = task.KernelExecutionDelegate;

                    // Split the grid into different chunks that will be processed
                    // by the available multiprocessors
                    int linearGridDim = task.GridDim.Size;
                    int gridChunkSize = IntrinsicMath.DivRoundUp(
                        linearGridDim,
                        NumMultiprocessors);
                    int gridOffset = gridChunkSize * processorIdx;
                    int linearUserDim = task.TotalUserDim.Size;
                    for (
                        int i = gridOffset, e = gridOffset + gridChunkSize;
                        i < e;
                        ++i)
                    {
                        groupContext.BeginThreadProcessing();
                        try
                        {
                            // Setup the current grid index
                            threadContext.GridIndex = Index3D.ReconstructIndex(
                                i,
                                task.GridDim);

                            // Compute the global index using 64-bit arithmetic:
                            // the chunk end is rounded up, so i can exceed the
                            // grid size and a 32-bit product could wrap around
                            // to a negative value that passes the range check.
                            long globalIndex = (long)i * groupSize + threadIdx;

                            // Invoke the actual kernel launcher
                            if (globalIndex < linearUserDim)
                            {
                                // The cast is lossless because globalIndex is
                                // smaller than an int value at this point
                                launcher(task, (int)globalIndex);
                            }
                        }
                        finally
                        {
                            groupContext.EndThreadProcessing();
                        }
                    }
                }
                finally
                {
                    // This thread has already finished processing
                    groupContext.FinishThreadProcessing();
                    warpContext.FinishThreadProcessing();
                }
            }
        }
        finally
        {
            // Wait for the threads sharing this processor barrier to arrive here
            processorBarrier.SignalAndWait();

            // If we reach this point and we are the main thread, notify the
            // parent accelerator instance
            if (isMainThread)
            {
                finishedEventPerMultiprocessor.SignalAndWait();
            }
        }
    }
}
/// <summary>
/// Registers this context instance as the current context by storing it in
/// <c>currentContext</c>.
/// </summary>
internal void MakeCurrent()
{
    currentContext = this;
}
/// <summary>
/// Entry point for a single processing thread.
/// </summary>
/// <remarks>
/// The thread binds its thread, warp and group contexts once and then loops:
/// it asks the accelerator for a new task, synchronizes on the processor
/// barrier, processes its chunk of the grid and synchronizes again. The loop
/// ends when <c>Accelerator.WaitForTask</c> returns false.
/// </remarks>
/// <param name="arg">The absolute thread index.</param>
private void ExecuteThread(object arg)
{
    // Get the current thread information
    int absoluteThreadIndex = (int)arg;
    int threadIdx = absoluteThreadIndex % MaxNumThreadsPerMultiprocessor;
    bool isMainThread = threadIdx == 0;

    // Setup a new thread context for this thread and initialize the lane index
    int laneIdx = threadIdx % WarpSize;
    int warpIdx = threadIdx / WarpSize;
    var threadContext = new CPURuntimeThreadContext(laneIdx, warpIdx)
    {
        LinearGroupIndex = threadIdx
    };
    threadContext.MakeCurrent();

    // Setup the current warp context as it always stays the same
    var warpContext = warpContexts[warpIdx];
    warpContext.MakeCurrent();

    // Setup the current group context as it always stays the same
    groupContext.MakeCurrent();

    CPUAcceleratorTask task = null;
    for (; ; )
    {
        // Get a new task to execute (if any)
        if (!Accelerator.WaitForTask(ref task))
        {
            break;
        }

        // Setup the current group index
        threadContext.GroupIndex = Stride3D.DenseXY.ReconstructFromElementIndex(
            threadIdx,
            task.GroupDim);

        // Wait for the threads sharing the processor barrier to arrive here
        Thread.MemoryBarrier();
        processorBarrier.SignalAndWait();
        try
        {
            // If we are an active group thread
            int groupSize = task.GroupDim.Size;
            if (threadIdx < groupSize)
            {
                try
                {
                    var launcher = task.KernelExecutionDelegate;

                    // Split the grid into different chunks that will be processed
                    // by the available multiprocessors
                    int linearGridDim = task.GridDim.Size;
                    int gridChunkSize = IntrinsicMath.DivRoundUp(
                        linearGridDim,
                        Accelerator.NumMultiprocessors);
                    int gridOffset = gridChunkSize * ProcessorIndex;
                    int linearUserDim = task.TotalUserDim.Size;
                    for (
                        int i = gridOffset, e = gridOffset + gridChunkSize;
                        i < e;
                        ++i)
                    {
                        BeginThreadProcessing();
                        try
                        {
                            // Setup the current grid index
                            threadContext.GridIndex = Stride3D.DenseXY
                                .ReconstructFromElementIndex(
                                    i,
                                    task.GridDim);

                            // Compute the global index using 64-bit arithmetic:
                            // the chunk end is rounded up, so i can exceed the
                            // grid size and a 32-bit product could wrap around
                            // to a negative value that passes the range check.
                            long globalIndex = (long)i * groupSize + threadIdx;

                            // Invoke the actual kernel launcher
                            if (globalIndex < linearUserDim)
                            {
                                // The cast is lossless because globalIndex is
                                // smaller than an int value at this point
                                launcher(task, (int)globalIndex);
                            }
                        }
                        finally
                        {
                            EndThreadProcessing();
                        }
                    }
                }
                finally
                {
                    // This thread has already finished processing
                    FinishThreadProcessing();
                }
            }
        }
        finally
        {
            // Wait for the threads sharing the processor barrier to arrive here
            processorBarrier.SignalAndWait();

            // If we reach this point and we are the main thread, notify the
            // parent accelerator instance
            if (isMainThread)
            {
                Accelerator.FinishTaskProcessing();
            }
        }
    }
}