Example #1
0
        /// <summary cref="KernelAccelerator{TCompiledKernel, TKernel}
        /// .GenerateKernelLauncherMethod(TCompiledKernel, int)"/>
        /// <remarks>
        /// The emitted launcher stores the kernel instance in a local, hands the
        /// emitter to the backend argument mapper, pushes the stream, the kernel and
        /// the runtime kernel configuration, and finally emits calls to
        /// <c>LaunchKernelMethod</c> and <c>ThrowIfFailedMethod</c>.
        /// </remarks>
        /// <exception cref="NotSupportedException">
        /// Thrown if the entry point has by-ref parameters.
        /// </exception>
        protected override MethodInfo GenerateKernelLauncherMethod(
            CLCompiledKernel kernel,
            int customGroupSize)
        {
            var kernelEntry = kernel.EntryPoint;
            AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

            // By-ref kernel parameters are rejected by this launcher generator
            if (kernelEntry.HasByRefParameters)
            {
                throw new NotSupportedException(
                    ErrorMessages.NotSupportedByRefKernelParameters);
            }

            var launcherBuilder = kernelEntry.CreateLauncherMethod(Context);
            var il = new ILEmitter(launcherBuilder.ILGenerator);

            // Keep the kernel instance in a local so it can be reloaded below
            var kernelInstance = il.DeclareLocal(typeof(CLKernel));
            KernelLauncherBuilder.EmitLoadKernelArgument<CLKernel, ILEmitter>(
                Kernel.KernelInstanceParamIdx,
                il);
            il.Emit(LocalOperation.Store, kernelInstance);

            // Hand the emitter to the backend mapper to map all kernel arguments
            Backend.ArgumentMapper.Map(il, kernelInstance, kernelEntry);

            // Push order for the launch call: stream, kernel, runtime kernel config
            KernelLauncherBuilder.EmitLoadAcceleratorStream<CLStream, ILEmitter>(
                Kernel.KernelStreamParamIdx,
                il);
            il.Emit(LocalOperation.Load, kernelInstance);
            KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                customGroupSize);

            // Dispatch the kernel, then emit the ThrowIfFailed call
            il.EmitCall(LaunchKernelMethod);
            il.EmitCall(ThrowIfFailedMethod);

            // Close the launcher body
            il.Emit(OpCodes.Ret);
            il.Finish();

            return launcherBuilder.Finish();
        }
Example #2
0
        /// <summary>
        /// Creates a dynamic kernel-launcher method that is just-in-time compiled on
        /// its first invocation. Launching through the generated method avoids
        /// unnecessary operations (such as boxing) and therefore keeps the launch
        /// overhead low.
        /// </summary>
        /// <param name="kernel">The kernel to generate a launcher for.</param>
        /// <param name="customGroupSize">The custom group size for the launching operation.</param>
        /// <returns>The generated launcher method.</returns>
        private MethodInfo GenerateKernelLauncherMethod(ILCompiledKernel kernel, int customGroupSize)
        {
            var kernelEntry = kernel.EntryPoint;
            AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

            var launcherBuilder = kernelEntry.CreateLauncherMethod(Context);
            var il = new ILEmitter(launcherBuilder.ILGenerator);

            // Keep the kernel instance (loaded as CPUKernel) in a local for reuse
            var kernelLocal = il.DeclareLocal(typeof(CPUKernel));
            KernelLauncherBuilder.EmitLoadKernelArgument<CPUKernel, ILEmitter>(
                Kernel.KernelInstanceParamIdx,
                il);
            il.Emit(LocalOperation.Store, kernelLocal);

            // Construct the custom task object; the values emitted below are the
            // constructor arguments in push order
            var taskLocal = il.DeclareLocal(kernel.TaskType);
            var sharedMemSizeLocal =
                KernelLauncherBuilder.EmitSharedMemorySizeComputation(kernelEntry, il);

            // Kernel-execution delegate, read from the kernel instance
            il.Emit(LocalOperation.Load, kernelLocal);
            var executionDelegateGetter = typeof(CPUKernel)
                .GetProperty(
                    nameof(CPUKernel.KernelExecutionDelegate),
                    BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance)
                .GetGetMethod(true);
            il.EmitCall(executionDelegateGetter);

            // Both dimension loads below construct Index3 values with this constructor
            var index3Constructor = typeof(Index3).GetConstructor(
                new[] { typeof(int), typeof(int), typeof(int) });

            // Custom user dimension
            KernelLauncherBuilder.EmitLoadDimensions(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                () => il.EmitNewObject(index3Constructor));

            // Dimensions as Index3 arguments, taking the custom group size into account
            KernelLauncherBuilder.EmitLoadDimensions(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                () => il.EmitNewObject(index3Constructor),
                customGroupSize);

            // Shared-memory size, then construct and store the task
            il.Emit(LocalOperation.Load, sharedMemSizeLocal);
            il.EmitNewObject(kernel.TaskConstructor);
            il.Emit(LocalOperation.Store, taskLocal);

            // Copy every launcher argument into its mapped task field
            var kernelParameters = kernelEntry.Parameters;
            int parameterCount = kernelParameters.NumParameters;
            for (int index = 0; index < parameterCount; index++)
            {
                il.Emit(LocalOperation.Load, taskLocal);
                il.Emit(ArgumentOperation.Load, index + Kernel.KernelParameterOffset);

                // By-ref arguments are dereferenced before being stored
                if (kernelParameters.IsByRef(index))
                {
                    il.Emit(OpCodes.Ldobj, kernelParameters[index]);
                }

                il.Emit(OpCodes.Stfld, kernel.TaskArgumentMapping[index]);
            }

            // Emit: ((CPUKernel)kernel).CPUAccelerator.Launch(task);
            il.Emit(LocalOperation.Load, kernelLocal);
            var acceleratorGetter = typeof(CPUKernel)
                .GetProperty(nameof(CPUKernel.CPUAccelerator))
                .GetGetMethod(false);
            il.EmitCall(acceleratorGetter);

            il.Emit(LocalOperation.Load, taskLocal);
            var launchMethod = typeof(CPUAccelerator).GetMethod(
                nameof(CPUAccelerator.Launch),
                BindingFlags.NonPublic | BindingFlags.Instance);
            il.EmitCall(launchMethod);

            // Close the launcher body
            il.Emit(OpCodes.Ret);
            il.Finish();

            return launcherBuilder.Finish();
        }
Example #3
0
        /// <summary cref="KernelAccelerator{TCompiledKernel, TKernel}
        /// .GenerateKernelLauncherMethod(TCompiledKernel, int)"/>
        /// <remarks>
        /// The emitted launcher stores the kernel instance in a local, hands the
        /// emitter to the backend argument mapper, pushes the result of
        /// <c>GetCLAPIMethod</c>, the stream, the kernel and the runtime kernel
        /// configuration, and then emits calls to the generic launch method
        /// (instantiated with a launch-handler type) and <c>ThrowIfFailedMethod</c>.
        /// </remarks>
        /// <exception cref="NotSupportedException">
        /// Thrown if the entry point has by-ref parameters.
        /// </exception>
        protected override MethodInfo GenerateKernelLauncherMethod(
            CLCompiledKernel kernel,
            int customGroupSize)
        {
            var kernelEntry = kernel.EntryPoint;
            AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

            // By-ref kernel parameters are rejected by this launcher generator
            if (kernelEntry.HasByRefParameters)
            {
                throw new NotSupportedException(
                    ErrorMessages.NotSupportedByRefKernelParameters);
            }

            // The scope object returned here is disposed when this method exits
            using var launcherScope = kernelEntry.CreateLauncherMethod(
                Context.RuntimeSystem,
                out var launcherBuilder);
            var il = new ILEmitter(launcherBuilder.ILGenerator);

            // Keep the kernel instance in a local so it can be reloaded below
            var kernelInstance = il.DeclareLocal(typeof(CLKernel));
            KernelLauncherBuilder.EmitLoadKernelArgument<CLKernel, ILEmitter>(
                Kernel.KernelInstanceParamIdx,
                il);
            il.Emit(LocalOperation.Store, kernelInstance);

            // Hand the emitter to the backend mapper to map all kernel arguments
            Backend.ArgumentMapper.Map(
                il,
                kernelInstance,
                Context.TypeContext,
                kernelEntry);

            // Push order for the launch call:
            // driver API, stream, kernel, runtime kernel config
            il.EmitCall(GetCLAPIMethod);
            KernelLauncherBuilder.EmitLoadAcceleratorStream<CLStream, ILEmitter>(
                Kernel.KernelStreamParamIdx,
                il);
            il.Emit(LocalOperation.Load, kernelInstance);
            KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                MaxGridSize,
                MaxGroupSize,
                customGroupSize);

            // Pick the launch handler depending on whether the kernel uses
            // dynamic shared memory
            Type launchHandlerType;
            if (kernelEntry.SharedMemory.HasDynamicMemory)
            {
                launchHandlerType = typeof(DynamicSharedMemoryHandler);
            }
            else
            {
                launchHandlerType = typeof(DefaultLaunchHandler);
            }

            // Dispatch the kernel, then emit the ThrowIfFailed call
            il.EmitCall(GenericLaunchKernelMethod.MakeGenericMethod(launchHandlerType));
            il.EmitCall(ThrowIfFailedMethod);

            // Close the launcher body
            il.Emit(OpCodes.Ret);
            il.Finish();

            return launcherBuilder.Finish();
        }
Example #4
0
        /// <summary>
        /// Creates a dynamic kernel-launcher method that is just-in-time compiled on
        /// its first invocation. Launching through the generated method avoids
        /// unnecessary operations (such as boxing) and therefore keeps the launch
        /// overhead low.
        /// </summary>
        /// <param name="kernel">The kernel to generate a launcher for.</param>
        /// <param name="customGroupSize">
        /// The custom group size for the launching operation.
        /// </param>
        /// <returns>The generated launcher method.</returns>
        /// <exception cref="NotSupportedException">
        /// Thrown if the entry point has by-ref parameters.
        /// </exception>
        private MethodInfo GenerateKernelLauncherMethod(
            ILCompiledKernel kernel,
            int customGroupSize)
        {
            var kernelEntry = kernel.EntryPoint;
            AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

            // By-ref kernel parameters are rejected by this launcher generator
            if (kernelEntry.HasByRefParameters)
            {
                throw new NotSupportedException(
                    ErrorMessages.NotSupportedByRefKernelParameters);
            }

            // The scope object returned here is disposed when this method exits
            using var launcherScope = kernelEntry.CreateLauncherMethod(
                Context.RuntimeSystem,
                out var launcherBuilder);
            var il = new ILEmitter(launcherBuilder.ILGenerator);

            // Run the backend argument mapper on the entry point (no emitter is
            // involved), mimicking what a GPU accelerator would perform
            Backend.ArgumentMapper.Map(kernelEntry);

            // Keep the kernel instance (loaded as CPUKernel) in a local for reuse
            var kernelLocal = il.DeclareLocal(typeof(CPUKernel));
            KernelLauncherBuilder.EmitLoadKernelArgument<CPUKernel, ILEmitter>(
                Kernel.KernelInstanceParamIdx,
                il);
            il.Emit(LocalOperation.Store, kernelLocal);

            // Construct the custom task object; the values emitted below are the
            // constructor arguments in push order
            var taskLocal = il.DeclareLocal(kernel.TaskType);

            // Kernel-execution delegate, read from the kernel instance
            il.Emit(LocalOperation.Load, kernelLocal);
            il.EmitCall(CPUKernel.GetKernelExecutionDelegate);

            // Custom user dimension
            KernelLauncherBuilder.EmitLoadKernelConfig(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                MaxGridSize,
                MaxGroupSize);

            // Runtime dimensions, taking the custom group size into account
            KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                MaxGridSize,
                MaxGroupSize,
                customGroupSize);

            // Construct and store the task
            il.EmitNewObject(kernel.TaskConstructor);
            il.Emit(LocalOperation.Store, taskLocal);

            // Copy every launcher argument into its mapped task field
            var kernelParameters = kernelEntry.Parameters;
            int parameterCount = kernelParameters.Count;
            for (int index = 0; index < parameterCount; index++)
            {
                il.Emit(LocalOperation.Load, taskLocal);
                il.Emit(ArgumentOperation.Load, index + Kernel.KernelParameterOffset);

                // By-ref arguments are dereferenced before being stored
                if (kernelParameters.IsByRef(index))
                {
                    il.Emit(OpCodes.Ldobj, kernelParameters[index]);
                }

                il.Emit(OpCodes.Stfld, kernel.TaskArgumentMapping[index]);
            }

            // Emit: ((CPUKernel)kernel).CPUAccelerator.Launch(task);
            il.Emit(LocalOperation.Load, kernelLocal);
            var acceleratorGetter = typeof(CPUKernel)
                .GetProperty(nameof(CPUKernel.CPUAccelerator))
                .GetGetMethod(false);
            il.EmitCall(acceleratorGetter);

            il.Emit(LocalOperation.Load, taskLocal);
            var launchMethod = typeof(CPUAccelerator).GetMethod(
                nameof(CPUAccelerator.Launch),
                BindingFlags.NonPublic | BindingFlags.Instance);
            il.EmitCall(launchMethod);

            // Close the launcher body
            il.Emit(OpCodes.Ret);
            il.Finish();

            return launcherBuilder.Finish();
        }
Example #5
0
        /// <summary>
        /// Generates the dynamic <c>Execute_{EntryName}</c> method for the given kernel.
        /// The emitted body casts the incoming task to <paramref name="taskType"/>,
        /// caches the task fields in locals, slices the shared-memory views out of the
        /// shared-memory argument, and then loops over the indices of the current chunk,
        /// invoking the kernel entry point for every index that lies within the target
        /// dimension and synchronizing on the barrier argument after each iteration.
        /// </summary>
        /// <param name="kernel">The kernel.</param>
        /// <param name="taskType">The created task.</param>
        /// <param name="taskArgumentMapping">The created task-argument mapping that maps parameter indices of uniforms
        /// and dynamically-sized shared-memory-variable-length specifications to fields in the task class.</param>
        /// <returns>The generated dynamic execute method.</returns>
        /// <remarks>
        /// NOTE(review): the argument positions used below (1 = barrier, 2 = shared-memory
        /// view, 3 = thread offset, 4 = group size, 7 = chunk size, 8 = chunk offset,
        /// 9 = target dimension) presumably mirror
        /// <c>CPUAcceleratorTask.ExecuteParameterTypes</c>, which is not visible here;
        /// confirm against that definition.
        /// </remarks>
        private static MethodInfo GenerateKernelExecutionMethod(
            CompiledKernel kernel,
            Type taskType,
            FieldInfo[] taskArgumentMapping)
        {
            var entryPoint         = kernel.EntryPoint;
            var ungroupedIndexType = entryPoint.UngroupedIndexType;

            // Build execute method (owned by the task type, JIT visibility checks skipped)
            var execute = new DynamicMethod(
                $"Execute_{kernel.EntryName}",
                typeof(void),
                CPUAcceleratorTask.ExecuteParameterTypes,
                taskType,
                true);

            // Build execute body
            var ilGenerator = execute.GetILGenerator();

            // Cast generic task type (argument 0) to actual task type
            var task = ilGenerator.DeclareLocal(taskType);

            ilGenerator.Emit(OpCodes.Ldarg_0);
            ilGenerator.Emit(OpCodes.Castclass, taskType);
            ilGenerator.Emit(OpCodes.Stloc, task);

            // Determine used grid dimensions
            var gridDim      = ilGenerator.DeclareLocal(ungroupedIndexType);
            var groupDimSize = ilGenerator.DeclareLocal(typeof(int));
            var groupDim     = ilGenerator.DeclareLocal(ungroupedIndexType);
            {
                var getGridDimFromTask = typeof(CPUAcceleratorTask).GetProperty(
                    nameof(CPUAcceleratorTask.UserGridDim)).GetGetMethod(false);

                // gridDim = convert(task.UserGridDim); the getter is invoked on the
                // uncast argument 0 here
                KernelLauncherBuilder.EmitConvertIndex3ToTargetType(
                    ungroupedIndexType, ilGenerator, () =>
                {
                    ilGenerator.Emit(OpCodes.Ldarg_0);
                    ilGenerator.Emit(OpCodes.Call, getGridDimFromTask);
                });
                ilGenerator.Emit(OpCodes.Stloc, gridDim);

                var getGroupDimFromTask = typeof(CPUAcceleratorTask).GetProperty(
                    nameof(CPUAcceleratorTask.GroupDim)).GetGetMethod(false);

                // groupDim = convert(task.GroupDim); the getter is invoked on the
                // cast task local here
                KernelLauncherBuilder.EmitConvertIndex3ToTargetType(
                    ungroupedIndexType, ilGenerator, () =>
                {
                    ilGenerator.Emit(OpCodes.Ldloc, task);
                    ilGenerator.Emit(OpCodes.Call, getGroupDimFromTask);
                });
                ilGenerator.Emit(OpCodes.Stloc, groupDim);

                // Compute linear group-dim size: groupDimSize = groupDim.Size
                // (the address of the local is loaded because Size is called on it)
                ilGenerator.Emit(OpCodes.Ldloca, groupDim);
                ilGenerator.Emit(
                    OpCodes.Call,
                    ungroupedIndexType.GetProperty(nameof(IIndex.Size)).GetGetMethod());
                ilGenerator.Emit(OpCodes.Stloc, groupDimSize);
            }

            // Cache all fields in local variables
            var taskArgumentLocals  = new LocalBuilder[taskArgumentMapping.Length];
            var numUniformVariables = entryPoint.NumUniformVariables;

            // The mapping holds the uniform fields first; any remaining entries are
            // consumed below as lengths of dynamically-sized shared-memory variables
            Debug.Assert(numUniformVariables <= taskArgumentLocals.Length);

            for (int i = 0, e = taskArgumentLocals.Length; i < e; ++i)
            {
                // Declare local
                taskArgumentLocals[i] = ilGenerator.DeclareLocal(
                    taskArgumentMapping[i].FieldType);

                // Load instance field i
                ilGenerator.Emit(OpCodes.Ldloc, task);
                ilGenerator.Emit(OpCodes.Ldfld, taskArgumentMapping[i]);

                // Cache field value in local variable
                ilGenerator.Emit(OpCodes.Stloc, taskArgumentLocals[i]);
            }

            // Cache types of shared-memory variable
            var sharedMemVariables = entryPoint.SharedMemoryVariables;

            // Initialize sharedMemOffset to 0
            var sharedMemOffset = ilGenerator.DeclareLocal(typeof(int));

            ilGenerator.Emit(OpCodes.Ldc_I4_0);
            ilGenerator.Emit(OpCodes.Stloc, sharedMemOffset);

            var sharedMemoryLocals        = new LocalBuilder[sharedMemVariables.Length];
            // Counts the dynamically-sized variables seen so far; used to pick the
            // matching length local that follows the uniform locals
            int dynamicallySizedLengthIdx = 0;

            for (int i = 0, e = sharedMemVariables.Length; i < e; ++i)
            {
                var sharedMemVariable = sharedMemVariables[i];
                sharedMemoryLocals[i] = ilGenerator.DeclareLocal(sharedMemVariable.Type);
                var lengthLocal = ilGenerator.DeclareLocal(typeof(int));

                // The length of dynamically-sized shared-memory variables has to be loaded
                // from the provided length fields
                if (sharedMemVariable.IsDynamicallySizedArray)
                {
                    ilGenerator.Emit(OpCodes.Ldloc, taskArgumentLocals[numUniformVariables + dynamicallySizedLengthIdx++]);
                    ilGenerator.Emit(OpCodes.Stloc, lengthLocal);
                }
                else
                {
                    // Statically-sized variable: use its size as a constant
                    ilGenerator.Emit(OpCodes.Ldc_I4, sharedMemVariable.Size);
                    ilGenerator.Emit(OpCodes.Stloc, lengthLocal);
                }

                // Load the address of argument 2 (the shared-memory view that
                // GetSubView is invoked on below)
                // NOTE(review): Ldarg/Ldarga are documented with a 16-bit operand while
                // Emit(OpCode, int) is used throughout this method - confirm that this
                // overload is intended.
                ilGenerator.Emit(OpCodes.Ldarga, 2);

                // Load offset & length
                ilGenerator.Emit(OpCodes.Ldloc, sharedMemOffset);
                KernelLauncherBuilder.EmitLoadIndex(ilGenerator);
                ilGenerator.Emit(OpCodes.Ldloc, lengthLocal);
                KernelLauncherBuilder.EmitLoadIndex(ilGenerator);

                // var tView = sharedMemory.GetSubView(offset, length)
                ilGenerator.Emit(OpCodes.Call, typeof(ArrayView <byte>).GetMethod(nameof(ArrayView <byte> .GetSubView),
                                                                                  new Type[] { typeof(Index), typeof(Index) }));

                // local = tView.Cast<ElementType>()
                // or
                // local = tView.Cast<ElementType>().GetVariableView();

                // The sub view is spilled to a local so that Cast can be called on
                // its address
                var castLocal = ilGenerator.DeclareLocal(typeof(ArrayView <byte>));
                ilGenerator.Emit(OpCodes.Stloc, castLocal);
                ilGenerator.Emit(OpCodes.Ldloca, castLocal);
                ilGenerator.Emit(OpCodes.Call,
                                 typeof(ArrayView <byte>).GetMethod(nameof(ArrayView <byte> .Cast)).MakeGenericMethod(
                                     sharedMemVariable.ElementType));

                // Non-array variables are exposed via the parameterless
                // GetVariableView() of the casted view
                if (!sharedMemVariable.IsArray)
                {
                    var genericArrayViewType = typeof(ArrayView <>).MakeGenericType(
                        sharedMemVariable.ElementType);
                    var castedLocal = ilGenerator.DeclareLocal(genericArrayViewType);
                    ilGenerator.Emit(OpCodes.Stloc, castedLocal);
                    ilGenerator.Emit(OpCodes.Ldloca, castedLocal);

                    ilGenerator.Emit(OpCodes.Call,
                                     genericArrayViewType.GetMethod(
                                         nameof(ArrayView <byte> .GetVariableView),
                                         new Type[] { }));
                }

                // Store shared-memory view to local variable
                ilGenerator.Emit(OpCodes.Stloc, sharedMemoryLocals[i]);

                // Add the current size to the memory offset
                ilGenerator.Emit(OpCodes.Ldloc, sharedMemOffset);
                ilGenerator.Emit(OpCodes.Ldloc, lengthLocal);
                ilGenerator.Emit(OpCodes.Add);
                ilGenerator.Emit(OpCodes.Stloc, sharedMemOffset);
            }

            // Build loop to address all dispatched grid indices.
            // Emitted layout: init counter; br header; body; barrier; increment; header
            var loopHeader = ilGenerator.DefineLabel();
            var loopBody   = ilGenerator.DefineLabel();

            // Init counter: int i = WarpSize * runtimeWarpId + runtimeWarpThreadIdx;
            // => int i = runtimeWarpOffset + runtimeWarpThreadIdx
            // => int i = runtimeThreadOffset
            var chunkIdxCounter = ilGenerator.DeclareLocal(typeof(int));
            var breakCondition  = ilGenerator.DeclareLocal(typeof(bool));

            // i = argument 3; then jump straight to the loop condition
            ilGenerator.Emit(OpCodes.Ldarg, 3);
            ilGenerator.Emit(OpCodes.Stloc, chunkIdxCounter);
            ilGenerator.Emit(OpCodes.Br, loopHeader);

            var globalIndex = ilGenerator.DeclareLocal(typeof(int));

            // Loop body
            {
                ilGenerator.MarkLabel(loopBody);

                // var index = i + chunkOffset; (chunkOffset is argument 8)
                ilGenerator.Emit(OpCodes.Ldloc, chunkIdxCounter);
                ilGenerator.Emit(OpCodes.Ldarg, 8);
                ilGenerator.Emit(OpCodes.Add);

                ilGenerator.Emit(OpCodes.Stloc, globalIndex);
            }

            // Check the custom user dimension
            // globalIndex < targetDimension (targetDimension is argument 9);
            // if the check fails, the kernel call is skipped but the barrier below
            // is still executed
            var kernelNotInvoked = ilGenerator.DefineLabel();

            ilGenerator.Emit(OpCodes.Ldloc, globalIndex);
            ilGenerator.Emit(OpCodes.Ldarg, 9);
            ilGenerator.Emit(OpCodes.Clt);
            ilGenerator.Emit(OpCodes.Stloc, breakCondition);
            ilGenerator.Emit(OpCodes.Ldloc, breakCondition);
            ilGenerator.Emit(OpCodes.Brfalse, kernelNotInvoked);

            // Launch the actual kernel method
            {
                // Construct launch index from linear index
                ilGenerator.Emit(OpCodes.Ldloc, globalIndex);
                if (!entryPoint.IsGroupedIndexEntry)
                {
                    // Ungrouped entry: reconstruct the index directly from the linear
                    // index, using gridDim as the dimension
                    KernelLauncherBuilder.EmitConvertFrom1DIndexToTargetIndexType(
                        ungroupedIndexType,
                        ilGenerator,
                        () => ilGenerator.Emit(OpCodes.Ldloc, gridDim));
                }
                else
                {
                    // We have to split grid and group indices for GroupedIndex-reconstruction
                    var linearIdx = ilGenerator.DeclareLocal(typeof(int));
                    ilGenerator.Emit(OpCodes.Stloc, linearIdx);

                    // Compute grid index: linearIdx / groupDimSize
                    ilGenerator.Emit(OpCodes.Ldloc, linearIdx);
                    ilGenerator.Emit(OpCodes.Ldloc, groupDimSize);
                    ilGenerator.Emit(OpCodes.Div);
                    KernelLauncherBuilder.EmitConvertFrom1DIndexToTargetIndexType(
                        ungroupedIndexType,
                        ilGenerator,
                        () => ilGenerator.Emit(OpCodes.Ldloc, gridDim));

                    // Compute group index: linearIdx % groupDimSize
                    ilGenerator.Emit(OpCodes.Ldloc, linearIdx);
                    ilGenerator.Emit(OpCodes.Ldloc, groupDimSize);
                    ilGenerator.Emit(OpCodes.Rem);
                    KernelLauncherBuilder.EmitConvertFrom1DIndexToTargetIndexType(
                        ungroupedIndexType,
                        ilGenerator,
                        () => ilGenerator.Emit(OpCodes.Ldloc, groupDim));

                    // Combine (gridIndex, groupIndex) into the grouped kernel index
                    var groupedConstructor = entryPoint.KernelIndexType.GetConstructor(
                        new Type[] { ungroupedIndexType, ungroupedIndexType });
                    ilGenerator.Emit(OpCodes.Newobj, groupedConstructor);
                }

                // Order the cached locals by kernel-parameter position.
                // The "Index - 1" presumably skips the leading kernel-index parameter
                // that was pushed above - TODO confirm.
                var variableReferences = new LocalBuilder[entryPoint.NumCustomParameters];
                for (int i = 0; i < numUniformVariables; ++i)
                {
                    variableReferences[entryPoint.UniformVariables[i].Index - 1] = taskArgumentLocals[i];
                }
                for (int i = 0, e = sharedMemoryLocals.Length; i < e; ++i)
                {
                    variableReferences[entryPoint.SharedMemoryVariables[i].Index - 1] = sharedMemoryLocals[i];
                }

                // Load kernel arguments in parameter order
                foreach (var variableRef in variableReferences)
                {
                    Debug.Assert(variableRef != null, "Invalid kernel argument");
                    ilGenerator.Emit(OpCodes.Ldloc, variableRef);
                }

                // Invoke kernel
                ilGenerator.Emit(OpCodes.Call, entryPoint.MethodInfo);
            }

            // Synchronize group threads (also reached when the kernel call was skipped)
            {
                ilGenerator.MarkLabel(kernelNotInvoked);

                // Memory barrier for interlocked calls
                ilGenerator.Emit(
                    OpCodes.Call,
                    typeof(Thread).GetMethod(
                        nameof(Thread.MemoryBarrier),
                        BindingFlags.Public | BindingFlags.Static));

                // Wait for other group threads: argument 1 is the Barrier on which the
                // parameterless SignalAndWait() overload is called
                ilGenerator.Emit(OpCodes.Ldarg, 1);
                ilGenerator.Emit(
                    OpCodes.Call,
                    typeof(Barrier).GetMethod(
                        nameof(Barrier.SignalAndWait),
                        BindingFlags.Public | BindingFlags.Instance,
                        null,
                        new Type[] { },
                        null));
            }

            // Increase counter
            {
                // i += groupSize (groupSize is argument 4)
                ilGenerator.Emit(OpCodes.Ldloc, chunkIdxCounter);
                ilGenerator.Emit(OpCodes.Ldarg, 4);
                ilGenerator.Emit(OpCodes.Add);
                ilGenerator.Emit(OpCodes.Stloc, chunkIdxCounter);
            }

            // Loop header
            {
                ilGenerator.MarkLabel(loopHeader);

                // if (i < chunkSize) goto loopBody; (chunkSize is argument 7)
                ilGenerator.Emit(OpCodes.Ldloc, chunkIdxCounter);
                ilGenerator.Emit(OpCodes.Ldarg, 7);
                ilGenerator.Emit(OpCodes.Clt);
                ilGenerator.Emit(OpCodes.Stloc, breakCondition);
                ilGenerator.Emit(OpCodes.Ldloc, breakCondition);
                ilGenerator.Emit(OpCodes.Brtrue, loopBody);
            }

            // Emit final return
            ilGenerator.Emit(OpCodes.Ret);

            return(execute);
        }
Example #6
0
        /// <summary>
        /// Creates a dynamic kernel-launcher method that is just-in-time compiled on
        /// its first invocation. Launching through the generated method avoids
        /// unnecessary operations (such as boxing) and therefore keeps the launch
        /// overhead low.
        /// </summary>
        /// <param name="kernel">The kernel to generate a launcher for.</param>
        /// <param name="taskType">Receives the task type generated for the kernel.</param>
        /// <param name="taskArgumentMapping">
        /// Receives the generated task-argument mapping; entry <c>i</c> is the task field
        /// that the launcher fills with uniform argument <c>i</c>.
        /// </param>
        /// <param name="customGroupSize">The custom group size for the launching operation.</param>
        /// <returns>The generated launcher method.</returns>
        private MethodInfo GenerateKernelLauncherMethod(
            CompiledKernel kernel,
            out Type taskType,
            out FieldInfo[] taskArgumentMapping,
            int customGroupSize)
        {
            var kernelEntry = kernel.EntryPoint;
            AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

            var uniformCount = kernelEntry.UniformVariables.Length;

            var customParamTypes = kernelEntry.CreateCustomParameterTypes();
            var launcherParamTypes =
                new Type[customParamTypes.Length + Kernel.KernelParameterOffset];

            GenerateAcceleratorTask(
                kernel,
                customParamTypes,
                out taskType,
                out ConstructorInfo taskCtor,
                out taskArgumentMapping);

            // Signature: Launcher(Kernel, AcceleratorStream, [Index], ...custom parameters)
            launcherParamTypes[Kernel.KernelInstanceParamIdx] = typeof(Kernel);
            launcherParamTypes[Kernel.KernelStreamParamIdx] = typeof(AcceleratorStream);
            launcherParamTypes[Kernel.KernelParamDimensionIdx] = kernelEntry.KernelIndexType;
            customParamTypes.CopyTo(launcherParamTypes, Kernel.KernelParameterOffset);

            // Define the launcher itself, owned by the Kernel type
            var launcher = new DynamicMethod(
                kernel.EntryName,
                typeof(void),
                launcherParamTypes,
                typeof(Kernel));
            var launcherParams = launcher.GetParameters();
            var il = launcher.GetILGenerator();

            // Keep the kernel instance (loaded as CPUKernel) in a local for reuse
            var kernelLocal = il.DeclareLocal(typeof(CPUKernel));
            KernelLauncherBuilder.EmitLoadKernelArgument<CPUKernel>(
                Kernel.KernelInstanceParamIdx,
                il);
            il.Emit(OpCodes.Stloc, kernelLocal);

            // Construct the custom task object; the values emitted below are the
            // constructor arguments in push order
            var taskLocal = il.DeclareLocal(taskType);
            var sharedMemSizeLocal = KernelLauncherBuilder.EmitSharedMemorySizeComputation(
                kernelEntry,
                il,
                index => launcherParams[index + Kernel.KernelParameterOffset]);

            // Kernel-execution delegate, read from the kernel instance
            il.Emit(OpCodes.Ldloc, kernelLocal);
            var executionDelegateGetter = typeof(CPUKernel)
                .GetProperty(
                    nameof(CPUKernel.KernelExecutionDelegate),
                    BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance)
                .GetGetMethod(true);
            il.Emit(OpCodes.Call, executionDelegateGetter);

            // Both dimension loads below construct Index3 values with this constructor
            var index3Constructor = typeof(Index3).GetConstructor(
                new[] { typeof(int), typeof(int), typeof(int) });

            // Custom user dimension
            KernelLauncherBuilder.EmitLoadDimensions(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                () => il.Emit(OpCodes.Newobj, index3Constructor));

            // Dimensions as Index3 arguments, taking the custom group size into account
            KernelLauncherBuilder.EmitLoadDimensions(
                kernelEntry,
                il,
                Kernel.KernelParamDimensionIdx,
                () => il.Emit(OpCodes.Newobj, index3Constructor),
                customGroupSize);

            // Shared-memory size, then construct and store the task
            il.Emit(OpCodes.Ldloc, sharedMemSizeLocal);
            il.Emit(OpCodes.Newobj, taskCtor);
            il.Emit(OpCodes.Stloc, taskLocal);

            // Copy every uniform launcher argument into its mapped task field
            for (int index = 0; index < uniformCount; index++)
            {
                il.Emit(OpCodes.Ldloc, taskLocal);
                il.Emit(OpCodes.Ldarg, index + Kernel.KernelParameterOffset);
                il.Emit(OpCodes.Stfld, taskArgumentMapping[index]);
            }

            // Emit: ((CPUKernel)kernel).CPUAccelerator.Launch(task);
            il.Emit(OpCodes.Ldloc, kernelLocal);
            var acceleratorGetter = typeof(CPUKernel)
                .GetProperty(nameof(CPUKernel.CPUAccelerator))
                .GetGetMethod(false);
            il.Emit(OpCodes.Call, acceleratorGetter);

            il.Emit(OpCodes.Ldloc, taskLocal);
            var launchMethod = typeof(CPUAccelerator).GetMethod(
                nameof(CPUAccelerator.Launch),
                BindingFlags.NonPublic | BindingFlags.Instance);
            il.Emit(OpCodes.Call, launchMethod);

            // Close the launcher body
            il.Emit(OpCodes.Ret);

            return launcher;
        }