Beispiel #1
0
        /// <summary cref="Accelerator.EstimateGroupSizeInternal(
        /// Kernel, int, int, out int)"/>
        protected override int EstimateGroupSizeInternal(
            Kernel kernel,
            int dynamicSharedMemorySizeInBytes,
            int maxGroupSize,
            out int minGridSize)
        {
            if (dynamicSharedMemorySizeInBytes > 0)
            {
                throw new ArgumentOutOfRangeException(
                          nameof(dynamicSharedMemorySizeInBytes));
            }

            if (maxGroupSize < 1)
            {
                maxGroupSize = MaxNumThreadsPerGroup;
            }

            var clKernel            = kernel as CLKernel;
            var workGroupSizeNative = CurrentAPI.GetKernelWorkGroupInfo <IntPtr>(
                clKernel.KernelPtr,
                DeviceId,
                CLKernelWorkGroupInfoType.CL_KERNEL_WORK_GROUP_SIZE);
            int workGroupSize = workGroupSizeNative.ToInt32();

            workGroupSize = IntrinsicMath.Min(workGroupSize, maxGroupSize);
            minGridSize   = IntrinsicMath.DivRoundUp(MaxNumThreads, workGroupSize);

            return(workGroupSize);
        }
Beispiel #2
0
        /// <summary>
        /// Setups all runtime classes related to <see cref="CPURuntimeGroupContext"/>
        /// and <see cref="CPURuntimeWarpContext"/>.
        /// </summary>
        /// <param name="task">The current CPU task.</param>
        private void SetupRuntimeClasses(CPUAcceleratorTask task)
        {
            // Setup groups contexts
            int groupSize = task.GroupDim.Size;
            int numWarps  = IntrinsicMath.DivRoundUp(groupSize, WarpSize);

            if (numWarps * WarpSize > MaxNumThreadsPerGroup)
            {
                throw new NotSupportedException(string.Format(
                                                    RuntimeErrorMessages.NotSupportedTotalGroupSize,
                                                    MaxNumThreadsPerGroup));
            }

            // Initialize the associated group context
            groupContext.Initialize(
                task.GridDim,
                task.GroupDim,
                task.DynamicSharedMemoryConfig);

            // Initialize each involved warp context
            for (int i = 0, e = numWarps - 1; i < e; ++i)
            {
                warpContexts[i].Initialize(WarpSize);
            }

            int lastWarpSize = groupSize % WarpSize == 0
                ? WarpSize
                : groupSize % WarpSize;

            warpContexts[numWarps - 1].Initialize(lastWarpSize);
        }
Beispiel #3
0
 public static long DivRoundUp(long numerator, long denominator) =>
 IntrinsicMath.DivRoundUp(numerator, denominator);
Beispiel #4
0
 public static int DivRoundUp(int numerator, int denominator) =>
 IntrinsicMath.DivRoundUp(numerator, denominator);
Beispiel #5
0
        /// <summary>
        /// Entry point for a single processing thread.
        /// </summary>
        /// <param name="arg">The absolute thread index.</param>
        private void ExecuteThread(object arg)
        {
            // Get the current thread information
            int absoluteThreadIndex = (int)arg;
            int threadIdx           = absoluteThreadIndex % MaxNumThreadsPerMultiprocessor;
            var processorIdx        = absoluteThreadIndex / MaxNumThreadsPerMultiprocessor;

            var  processorBarrier = processorBarriers[processorIdx];
            bool isMainThread     = threadIdx == 0;

            // Setup a new thread context for this thread and initialize the lane index
            int laneIdx       = threadIdx % WarpSize;
            var threadContext = new CPURuntimeThreadContext(laneIdx)
            {
                LinearGroupIndex = threadIdx
            };

            threadContext.MakeCurrent();

            // Setup the current warp context as it always stays the same
            int  warpIdx          = threadIdx / WarpSize;
            bool isMainWarpThread = threadIdx == 0;
            var  warpContext      = warpContexts[processorIdx, warpIdx];

            warpContext.MakeCurrent();

            // Setup the current group context as it always stays the same
            var groupContext = groupContexts[processorIdx];

            groupContext.MakeCurrent();

            CPUAcceleratorTask task = null;

            for (; ;)
            {
                // Get a new task to execute
                lock (taskSynchronizationObject)
                {
                    while ((currentTask == null | currentTask == task) & running)
                    {
                        Monitor.Wait(taskSynchronizationObject);
                    }
                    if (!running)
                    {
                        break;
                    }
                    task = currentTask;
                }
                Debug.Assert(task != null, "Invalid task");

                // Setup the current group index
                threadContext.GroupIndex = Index3D.ReconstructIndex(
                    threadIdx,
                    task.GroupDim);

                // Wait for all threads of all multiprocessors to arrive here
                Thread.MemoryBarrier();
                processorBarrier.SignalAndWait();

                try
                {
                    // If we are an active group thread
                    int groupSize = task.GroupDim.Size;
                    if (threadIdx < groupSize)
                    {
                        try
                        {
                            var launcher = task.KernelExecutionDelegate;

                            // Split the grid into different chunks that will be processed
                            // by the available multiprocessors
                            int linearGridDim = task.GridDim.Size;
                            int gridChunkSize = IntrinsicMath.DivRoundUp(
                                linearGridDim,
                                NumMultiprocessors);
                            int gridOffset    = gridChunkSize * processorIdx;
                            int linearUserDim = task.TotalUserDim.Size;
                            for (
                                int i = gridOffset, e = gridOffset + gridChunkSize;
                                i < e;
                                ++i)
                            {
                                groupContext.BeginThreadProcessing();
                                try
                                {
                                    // Setup the current grid index
                                    threadContext.GridIndex = Index3D.ReconstructIndex(
                                        i,
                                        task.GridDim);

                                    // Invoke the actual kernel launcher
                                    int globalIndex = i * groupSize + threadIdx;
                                    if (globalIndex < linearUserDim)
                                    {
                                        launcher(task, globalIndex);
                                    }
                                }
                                finally
                                {
                                    groupContext.EndThreadProcessing();
                                }
                            }
                        }
                        finally
                        {
                            // This thread has already finished processing
                            groupContext.FinishThreadProcessing();
                            warpContext.FinishThreadProcessing();
                        }
                    }
                }
                finally
                {
                    // Wait for all threads of all multiprocessors to arrive here
                    processorBarrier.SignalAndWait();

                    // If we reach this point and we are the main thread, notify the
                    // parent accelerator instance
                    if (isMainThread)
                    {
                        finishedEventPerMultiprocessor.SignalAndWait();
                    }
                }
            }
        }
Beispiel #6
0
        /// <summary>
        /// Entry point for a single processing thread.
        /// </summary>
        /// <param name="arg">The absolute thread index.</param>
        private void ExecuteThread(object arg)
        {
            // Get the current thread information
            int absoluteThreadIndex = (int)arg;
            int threadIdx           = absoluteThreadIndex % MaxNumThreadsPerMultiprocessor;

            bool isMainThread = threadIdx == 0;

            // Setup a new thread context for this thread and initialize the lane index
            int laneIdx       = threadIdx % WarpSize;
            int warpIdx       = threadIdx / WarpSize;
            var threadContext = new CPURuntimeThreadContext(laneIdx, warpIdx)
            {
                LinearGroupIndex = threadIdx
            };

            threadContext.MakeCurrent();

            // Setup the current warp context as it always stays the same
            bool isMainWarpThread = threadIdx == 0;
            var  warpContext      = warpContexts[warpIdx];

            warpContext.MakeCurrent();

            // Setup the current group context as it always stays the same
            groupContext.MakeCurrent();

            CPUAcceleratorTask task = null;

            for (; ;)
            {
                // Get a new task to execute (if any)
                if (!Accelerator.WaitForTask(ref task))
                {
                    break;
                }

                // Setup the current group index
                threadContext.GroupIndex = Stride3D.DenseXY.ReconstructFromElementIndex(
                    threadIdx,
                    task.GroupDim);

                // Wait for all threads of all multiprocessors to arrive here
                Thread.MemoryBarrier();
                processorBarrier.SignalAndWait();

                try
                {
                    // If we are an active group thread
                    int groupSize = task.GroupDim.Size;
                    if (threadIdx < groupSize)
                    {
                        try
                        {
                            var launcher = task.KernelExecutionDelegate;

                            // Split the grid into different chunks that will be processed
                            // by the available multiprocessors
                            int linearGridDim = task.GridDim.Size;
                            int gridChunkSize = IntrinsicMath.DivRoundUp(
                                linearGridDim,
                                Accelerator.NumMultiprocessors);
                            int gridOffset    = gridChunkSize * ProcessorIndex;
                            int linearUserDim = task.TotalUserDim.Size;
                            for (
                                int i = gridOffset, e = gridOffset + gridChunkSize;
                                i < e;
                                ++i)
                            {
                                BeginThreadProcessing();
                                try
                                {
                                    // Setup the current grid index
                                    threadContext.GridIndex = Stride3D.DenseXY
                                                              .ReconstructFromElementIndex(
                                        i,
                                        task.GridDim);

                                    // Invoke the actual kernel launcher
                                    int globalIndex = i * groupSize + threadIdx;
                                    if (globalIndex < linearUserDim)
                                    {
                                        launcher(task, globalIndex);
                                    }
                                }
                                finally
                                {
                                    EndThreadProcessing();
                                }
                            }
                        }
                        finally
                        {
                            // This thread has already finished processing
                            FinishThreadProcessing();
                        }
                    }
                }
                finally
                {
                    // Wait for all threads of all multiprocessors to arrive here
                    processorBarrier.SignalAndWait();

                    // If we reach this point and we are the main thread, notify the
                    // parent accelerator instance
                    if (isMainThread)
                    {
                        Accelerator.FinishTaskProcessing();
                    }
                }
            }
        }