/// <summary>
/// Builds the launcher method for the given <see cref="CLCompiledKernel"/>.
/// The emitted code loads the kernel instance, maps all kernel arguments, loads
/// the stream, the kernel and the runtime kernel configuration, calls
/// <c>LaunchKernelMethod</c> and finally <c>ThrowIfFailedMethod</c>.
/// </summary>
/// <param name="kernel">The compiled kernel to generate a launcher for.</param>
/// <param name="customGroupSize">
/// The custom group size for the launching operation.
/// </param>
/// <returns>The finished launcher method.</returns>
/// <exception cref="NotSupportedException">
/// Thrown if the entry point reports by-ref parameters.
/// </exception>
protected override MethodInfo GenerateKernelLauncherMethod(
    CLCompiledKernel kernel,
    int customGroupSize)
{
    var kernelEntry = kernel.EntryPoint;
    AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

    // By-ref kernel parameters are rejected up front (support not added yet).
    if (kernelEntry.HasByRefParameters)
    {
        throw new NotSupportedException(
            ErrorMessages.NotSupportedByRefKernelParameters);
    }

    var launcherMethod = kernelEntry.CreateLauncherMethod(Context);
    var il = new ILEmitter(launcherMethod.ILGenerator);

    // Keep the typed kernel instance in a local for later use.
    var kernelInstance = il.DeclareLocal(typeof(CLKernel));
    KernelLauncherBuilder.EmitLoadKernelArgument<CLKernel, ILEmitter>(
        Kernel.KernelInstanceParamIdx,
        il);
    il.Emit(LocalOperation.Store, kernelInstance);

    // Let the backend's argument mapper emit the code for all kernel arguments.
    Backend.ArgumentMapper.Map(il, kernelInstance, kernelEntry);

    // Push the launch operands in order: stream, kernel, dimensions.
    KernelLauncherBuilder.EmitLoadAcceleratorStream<CLStream, ILEmitter>(
        Kernel.KernelStreamParamIdx,
        il);
    il.Emit(LocalOperation.Load, kernelInstance);
    KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
        kernelEntry,
        il,
        Kernel.KernelParamDimensionIdx,
        customGroupSize);

    // Dispatch the kernel and hand the outcome to ThrowIfFailed.
    il.EmitCall(LaunchKernelMethod);
    il.EmitCall(ThrowIfFailedMethod);

    il.Emit(OpCodes.Ret);
    il.Finish();

    return launcherMethod.Finish();
}
/// <summary>
/// Generates a dynamic launcher method for a CPU kernel. The emitted code
/// creates an instance of the kernel's task type, copies every kernel parameter
/// into the corresponding task field and passes the task to
/// <c>CPUAccelerator.Launch</c>.
/// </summary>
/// <param name="kernel">The kernel to generate a launcher for.</param>
/// <param name="customGroupSize">
/// The custom group size for the launching operation.
/// </param>
/// <returns>The generated launcher method.</returns>
private MethodInfo GenerateKernelLauncherMethod(
    ILCompiledKernel kernel,
    int customGroupSize)
{
    var kernelEntry = kernel.EntryPoint;
    AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

    var launcherMethod = kernelEntry.CreateLauncherMethod(Context);
    var il = new ILEmitter(launcherMethod.ILGenerator);

    // Keep the typed kernel instance in a local.
    var kernelInstance = il.DeclareLocal(typeof(CPUKernel));
    KernelLauncherBuilder.EmitLoadKernelArgument<CPUKernel, ILEmitter>(
        Kernel.KernelInstanceParamIdx,
        il);
    il.Emit(LocalOperation.Store, kernelInstance);

    // Construct the custom task object. The constructor operands are pushed in
    // this order: execution delegate, user dimension, dimensions (with the
    // custom group size), shared-memory size.
    var taskInstance = il.DeclareLocal(kernel.TaskType);
    var sharedMemorySize =
        KernelLauncherBuilder.EmitSharedMemorySizeComputation(kernelEntry, il);

    var getExecutionDelegate = typeof(CPUKernel)
        .GetProperty(
            nameof(CPUKernel.KernelExecutionDelegate),
            BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance)
        .GetGetMethod(true);
    il.Emit(LocalOperation.Load, kernelInstance);
    il.EmitCall(getExecutionDelegate);

    // Both dimension loads build their result via Index3(int, int, int).
    var index3Constructor = typeof(Index3).GetConstructor(
        new Type[] { typeof(int), typeof(int), typeof(int) });

    // Custom user dimension
    KernelLauncherBuilder.EmitLoadDimensions(
        kernelEntry,
        il,
        Kernel.KernelParamDimensionIdx,
        () => il.EmitNewObject(index3Constructor));

    // Dimensions as Index3 arguments
    KernelLauncherBuilder.EmitLoadDimensions(
        kernelEntry,
        il,
        Kernel.KernelParamDimensionIdx,
        () => il.EmitNewObject(index3Constructor),
        customGroupSize);

    il.Emit(LocalOperation.Load, sharedMemorySize);
    il.EmitNewObject(kernel.TaskConstructor);
    il.Emit(LocalOperation.Store, taskInstance);

    // Copy each kernel parameter into its task field.
    var kernelParameters = kernelEntry.Parameters;
    int parameterCount = kernelParameters.NumParameters;
    for (int paramIdx = 0; paramIdx < parameterCount; ++paramIdx)
    {
        il.Emit(LocalOperation.Load, taskInstance);
        il.Emit(ArgumentOperation.Load, paramIdx + Kernel.KernelParameterOffset);

        // By-ref parameters are dereferenced before they are stored.
        if (kernelParameters.IsByRef(paramIdx))
        {
            il.Emit(OpCodes.Ldobj, kernelParameters[paramIdx]);
        }

        il.Emit(OpCodes.Stfld, kernel.TaskArgumentMapping[paramIdx]);
    }

    // Launch task: ((CPUKernel)kernel).CPUAccelerator.Launch(task);
    var getAccelerator = typeof(CPUKernel)
        .GetProperty(nameof(CPUKernel.CPUAccelerator))
        .GetGetMethod(false);
    var launchTask = typeof(CPUAccelerator).GetMethod(
        nameof(CPUAccelerator.Launch),
        BindingFlags.NonPublic | BindingFlags.Instance);

    il.Emit(LocalOperation.Load, kernelInstance);
    il.EmitCall(getAccelerator);
    il.Emit(LocalOperation.Load, taskInstance);
    il.EmitCall(launchTask);

    // End of launch method
    il.Emit(OpCodes.Ret);
    il.Finish();

    return launcherMethod.Finish();
}
/// <summary>
/// Builds the launcher method for the given <see cref="CLCompiledKernel"/>.
/// The emitted code loads the kernel instance, maps all kernel arguments, loads
/// the driver API, the stream, the kernel and the runtime kernel configuration,
/// calls the generic launch method and finally <c>ThrowIfFailedMethod</c>.
/// </summary>
/// <param name="kernel">The compiled kernel to generate a launcher for.</param>
/// <param name="customGroupSize">
/// The custom group size for the launching operation.
/// </param>
/// <returns>The finished launcher method.</returns>
/// <exception cref="NotSupportedException">
/// Thrown if the entry point reports by-ref parameters.
/// </exception>
protected override MethodInfo GenerateKernelLauncherMethod(
    CLCompiledKernel kernel,
    int customGroupSize)
{
    var kernelEntry = kernel.EntryPoint;
    AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

    // By-ref kernel parameters are rejected up front (support not added yet).
    if (kernelEntry.HasByRefParameters)
    {
        throw new NotSupportedException(
            ErrorMessages.NotSupportedByRefKernelParameters);
    }

    // The returned scope object is disposed when this method exits.
    using var launcherScope = kernelEntry.CreateLauncherMethod(
        Context.RuntimeSystem,
        out var launcherMethod);
    var il = new ILEmitter(launcherMethod.ILGenerator);

    // Keep the typed kernel instance in a local for later use.
    var kernelInstance = il.DeclareLocal(typeof(CLKernel));
    KernelLauncherBuilder.EmitLoadKernelArgument<CLKernel, ILEmitter>(
        Kernel.KernelInstanceParamIdx,
        il);
    il.Emit(LocalOperation.Store, kernelInstance);

    // Let the backend's argument mapper emit the code for all kernel arguments.
    Backend.ArgumentMapper.Map(
        il,
        kernelInstance,
        Context.TypeContext,
        kernelEntry);

    // Push the launch operands in order: driver API, stream, kernel, dimensions.
    il.EmitCall(GetCLAPIMethod);
    KernelLauncherBuilder.EmitLoadAcceleratorStream<CLStream, ILEmitter>(
        Kernel.KernelStreamParamIdx,
        il);
    il.Emit(LocalOperation.Load, kernelInstance);
    KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
        kernelEntry,
        il,
        Kernel.KernelParamDimensionIdx,
        MaxGridSize,
        MaxGroupSize,
        customGroupSize);

    // Select the launch handler depending on whether the kernel uses dynamic
    // shared memory, then dispatch through the specialized launch method.
    Type launchHandlerType;
    if (kernelEntry.SharedMemory.HasDynamicMemory)
    {
        launchHandlerType = typeof(DynamicSharedMemoryHandler);
    }
    else
    {
        launchHandlerType = typeof(DefaultLaunchHandler);
    }
    il.EmitCall(GenericLaunchKernelMethod.MakeGenericMethod(launchHandlerType));

    // Hand the launch outcome to ThrowIfFailed.
    il.EmitCall(ThrowIfFailedMethod);

    il.Emit(OpCodes.Ret);
    il.Finish();

    return launcherMethod.Finish();
}
/// <summary>
/// Generates a dynamic launcher method for a CPU kernel. The emitted code
/// creates an instance of the kernel's task type, copies every kernel parameter
/// into the corresponding task field and passes the task to
/// <c>CPUAccelerator.Launch</c>.
/// </summary>
/// <param name="kernel">The kernel to generate a launcher for.</param>
/// <param name="customGroupSize">
/// The custom group size for the launching operation.
/// </param>
/// <returns>The generated launcher method.</returns>
/// <exception cref="NotSupportedException">
/// Thrown if the entry point reports by-ref parameters.
/// </exception>
private MethodInfo GenerateKernelLauncherMethod(
    ILCompiledKernel kernel,
    int customGroupSize)
{
    var kernelEntry = kernel.EntryPoint;
    AdjustAndVerifyKernelGroupSize(ref customGroupSize, kernelEntry);

    // By-ref kernel parameters are rejected up front (support not added yet).
    if (kernelEntry.HasByRefParameters)
    {
        throw new NotSupportedException(
            ErrorMessages.NotSupportedByRefKernelParameters);
    }

    // The returned scope object is disposed when this method exits.
    using var launcherScope = kernelEntry.CreateLauncherMethod(
        Context.RuntimeSystem,
        out var launcherMethod);
    var il = new ILEmitter(launcherMethod.ILGenerator);

    // Pretend to map kernel arguments (like a GPU accelerator would perform).
    Backend.ArgumentMapper.Map(kernelEntry);

    // Keep the typed kernel instance in a local.
    var kernelInstance = il.DeclareLocal(typeof(CPUKernel));
    KernelLauncherBuilder.EmitLoadKernelArgument<CPUKernel, ILEmitter>(
        Kernel.KernelInstanceParamIdx,
        il);
    il.Emit(LocalOperation.Store, kernelInstance);

    // Construct the custom task object. The constructor operands are pushed in
    // this order: execution delegate, user kernel config, runtime kernel config.
    var taskInstance = il.DeclareLocal(kernel.TaskType);
    il.Emit(LocalOperation.Load, kernelInstance);
    il.EmitCall(CPUKernel.GetKernelExecutionDelegate);

    // Custom user dimension
    KernelLauncherBuilder.EmitLoadKernelConfig(
        kernelEntry,
        il,
        Kernel.KernelParamDimensionIdx,
        MaxGridSize,
        MaxGroupSize);

    // Dimensions (taking the custom group size into account)
    KernelLauncherBuilder.EmitLoadRuntimeKernelConfig(
        kernelEntry,
        il,
        Kernel.KernelParamDimensionIdx,
        MaxGridSize,
        MaxGroupSize,
        customGroupSize);

    il.EmitNewObject(kernel.TaskConstructor);
    il.Emit(LocalOperation.Store, taskInstance);

    // Copy each kernel parameter into its task field.
    var kernelParameters = kernelEntry.Parameters;
    int parameterCount = kernelParameters.Count;
    for (int paramIdx = 0; paramIdx < parameterCount; ++paramIdx)
    {
        il.Emit(LocalOperation.Load, taskInstance);
        il.Emit(ArgumentOperation.Load, paramIdx + Kernel.KernelParameterOffset);

        // NOTE(review): given the by-ref guard at the top of this method this
        // branch looks unreachable - confirm before removing it.
        if (kernelParameters.IsByRef(paramIdx))
        {
            il.Emit(OpCodes.Ldobj, kernelParameters[paramIdx]);
        }

        il.Emit(OpCodes.Stfld, kernel.TaskArgumentMapping[paramIdx]);
    }

    // Launch task: ((CPUKernel)kernel).CPUAccelerator.Launch(task);
    var getAccelerator = typeof(CPUKernel)
        .GetProperty(nameof(CPUKernel.CPUAccelerator))
        .GetGetMethod(false);
    var launchTask = typeof(CPUAccelerator).GetMethod(
        nameof(CPUAccelerator.Launch),
        BindingFlags.NonPublic | BindingFlags.Instance);

    il.Emit(LocalOperation.Load, kernelInstance);
    il.EmitCall(getAccelerator);
    il.Emit(LocalOperation.Load, taskInstance);
    il.EmitCall(launchTask);

    // End of launch method
    il.Emit(OpCodes.Ret);
    il.Finish();

    return launcherMethod.Finish();
}
/// <summary>
/// Generates the dynamic execution method of a kernel. The emitted method casts
/// the incoming task to <paramref name="taskType"/>, caches all task fields and
/// shared-memory views in locals and then loops over the assigned indices,
/// invoking the kernel for every index that lies inside the target dimension and
/// synchronizing on the group barrier after every iteration.
/// </summary>
/// <param name="kernel">The kernel.</param>
/// <param name="taskType">The created task.</param>
/// <param name="taskArgumentMapping">
/// The created task-argument mapping that maps parameter indices of uniforms
/// and dynamically-sized shared-memory-variable-length specifications to fields
/// in the task class.
/// </param>
/// <returns>The generated execution method.</returns>
private static MethodInfo GenerateKernelExecutionMethod(
    CompiledKernel kernel,
    Type taskType,
    FieldInfo[] taskArgumentMapping)
{
    // Argument slots of the generated method that are read below. The slot
    // layout is defined by CPUAcceleratorTask.ExecuteParameterTypes (not visible
    // here); the names follow the in-code comments of this method.
    // They are typed as short on purpose: ldarg/ldarga take a 16-bit variable
    // index, so the Emit(OpCode, short) overload is the matching one. The
    // Emit(OpCode, int) overload is documented for a 4-byte operand.
    const short GroupBarrierArg = 1;
    const short SharedMemoryArg = 2;
    const short RuntimeThreadOffsetArg = 3;
    const short GroupSizeArg = 4;
    const short ChunkSizeArg = 7;
    const short ChunkOffsetArg = 8;
    const short TargetDimensionArg = 9;

    var entryPoint = kernel.EntryPoint;
    var ungroupedIndexType = entryPoint.UngroupedIndexType;

    // Build execute method
    var execute = new DynamicMethod(
        $"Execute_{kernel.EntryName}",
        typeof(void),
        CPUAcceleratorTask.ExecuteParameterTypes,
        taskType,
        true);

    // Build execute body
    var ilGenerator = execute.GetILGenerator();

    // Cast generic task type to actual task type
    var task = ilGenerator.DeclareLocal(taskType);
    ilGenerator.Emit(OpCodes.Ldarg_0);
    ilGenerator.Emit(OpCodes.Castclass, taskType);
    ilGenerator.Emit(OpCodes.Stloc, task);

    // Determine used grid dimensions
    var gridDim = ilGenerator.DeclareLocal(ungroupedIndexType);
    var groupDimSize = ilGenerator.DeclareLocal(typeof(int));
    var groupDim = ilGenerator.DeclareLocal(ungroupedIndexType);
    {
        var getGridDimFromTask = typeof(CPUAcceleratorTask).GetProperty(
            nameof(CPUAcceleratorTask.UserGridDim)).GetGetMethod(false);
        KernelLauncherBuilder.EmitConvertIndex3ToTargetType(
            ungroupedIndexType,
            ilGenerator,
            () =>
            {
                ilGenerator.Emit(OpCodes.Ldarg_0);
                ilGenerator.Emit(OpCodes.Call, getGridDimFromTask);
            });
        ilGenerator.Emit(OpCodes.Stloc, gridDim);

        var getGroupDimFromTask = typeof(CPUAcceleratorTask).GetProperty(
            nameof(CPUAcceleratorTask.GroupDim)).GetGetMethod(false);
        KernelLauncherBuilder.EmitConvertIndex3ToTargetType(
            ungroupedIndexType,
            ilGenerator,
            () =>
            {
                ilGenerator.Emit(OpCodes.Ldloc, task);
                ilGenerator.Emit(OpCodes.Call, getGroupDimFromTask);
            });
        ilGenerator.Emit(OpCodes.Stloc, groupDim);

        // Compute linear group-dim size
        ilGenerator.Emit(OpCodes.Ldloca, groupDim);
        ilGenerator.Emit(
            OpCodes.Call,
            ungroupedIndexType.GetProperty(nameof(IIndex.Size)).GetGetMethod());
        ilGenerator.Emit(OpCodes.Stloc, groupDimSize);
    }

    // Cache all fields in local variables
    var taskArgumentLocals = new LocalBuilder[taskArgumentMapping.Length];
    var numUniformVariables = entryPoint.NumUniformVariables;
    Debug.Assert(numUniformVariables <= taskArgumentLocals.Length);

    for (int i = 0, e = taskArgumentLocals.Length; i < e; ++i)
    {
        // Declare local
        taskArgumentLocals[i] = ilGenerator.DeclareLocal(
            taskArgumentMapping[i].FieldType);

        // Load instance field i
        ilGenerator.Emit(OpCodes.Ldloc, task);
        ilGenerator.Emit(OpCodes.Ldfld, taskArgumentMapping[i]);

        // Cache field value in local variable
        ilGenerator.Emit(OpCodes.Stloc, taskArgumentLocals[i]);
    }

    // Cache types of shared-memory variable
    var sharedMemVariables = entryPoint.SharedMemoryVariables;

    // Initialize sharedMemOffset to 0
    var sharedMemOffset = ilGenerator.DeclareLocal(typeof(int));
    ilGenerator.Emit(OpCodes.Ldc_I4_0);
    ilGenerator.Emit(OpCodes.Stloc, sharedMemOffset);

    var sharedMemoryLocals = new LocalBuilder[sharedMemVariables.Length];
    int dynamicallySizedLengthIdx = 0;
    for (int i = 0, e = sharedMemVariables.Length; i < e; ++i)
    {
        var sharedMemVariable = sharedMemVariables[i];
        sharedMemoryLocals[i] = ilGenerator.DeclareLocal(sharedMemVariable.Type);

        var lengthLocal = ilGenerator.DeclareLocal(typeof(int));
        if (sharedMemVariable.IsDynamicallySizedArray)
        {
            // The length of dynamically-sized shared-memory variables has to be
            // loaded from the provided length fields, which follow the uniform
            // variables in the cached task-argument locals.
            ilGenerator.Emit(
                OpCodes.Ldloc,
                taskArgumentLocals[numUniformVariables + dynamicallySizedLengthIdx++]);
            ilGenerator.Emit(OpCodes.Stloc, lengthLocal);
        }
        else
        {
            ilGenerator.Emit(OpCodes.Ldc_I4, sharedMemVariable.Size);
            ilGenerator.Emit(OpCodes.Stloc, lengthLocal);
        }

        // Load the address of the shared-memory view (argument index 2)
        ilGenerator.Emit(OpCodes.Ldarga, SharedMemoryArg);

        // Load offset & length
        ilGenerator.Emit(OpCodes.Ldloc, sharedMemOffset);
        KernelLauncherBuilder.EmitLoadIndex(ilGenerator);
        ilGenerator.Emit(OpCodes.Ldloc, lengthLocal);
        KernelLauncherBuilder.EmitLoadIndex(ilGenerator);

        // var tView = sharedMemory.GetSubView(offset, length)
        ilGenerator.Emit(
            OpCodes.Call,
            typeof(ArrayView<byte>).GetMethod(
                nameof(ArrayView<byte>.GetSubView),
                new Type[] { typeof(Index), typeof(Index) }));

        // local = tView.Cast<TargetType>
        // or
        // local = tView.Cast<TargetType>().GetVariableView();
        var castLocal = ilGenerator.DeclareLocal(typeof(ArrayView<byte>));
        ilGenerator.Emit(OpCodes.Stloc, castLocal);
        ilGenerator.Emit(OpCodes.Ldloca, castLocal);
        ilGenerator.Emit(
            OpCodes.Call,
            typeof(ArrayView<byte>)
                .GetMethod(nameof(ArrayView<byte>.Cast))
                .MakeGenericMethod(sharedMemVariable.ElementType));

        // Non-array variables are exposed through a variable view
        if (!sharedMemVariable.IsArray)
        {
            var genericArrayViewType = typeof(ArrayView<>).MakeGenericType(
                sharedMemVariable.ElementType);
            var castedLocal = ilGenerator.DeclareLocal(genericArrayViewType);
            ilGenerator.Emit(OpCodes.Stloc, castedLocal);
            ilGenerator.Emit(OpCodes.Ldloca, castedLocal);
            ilGenerator.Emit(
                OpCodes.Call,
                genericArrayViewType.GetMethod(
                    nameof(ArrayView<byte>.GetVariableView),
                    Type.EmptyTypes));
        }

        // Store shared-memory view to local variable
        ilGenerator.Emit(OpCodes.Stloc, sharedMemoryLocals[i]);

        // Add the current size to the memory offset
        ilGenerator.Emit(OpCodes.Ldloc, sharedMemOffset);
        ilGenerator.Emit(OpCodes.Ldloc, lengthLocal);
        ilGenerator.Emit(OpCodes.Add);
        ilGenerator.Emit(OpCodes.Stloc, sharedMemOffset);
    }

    // Build loop to address all dispatched grid indices
    var loopHeader = ilGenerator.DefineLabel();
    var loopBody = ilGenerator.DefineLabel();

    // Init counter: int i = runtimeThreadOffset
    var chunkIdxCounter = ilGenerator.DeclareLocal(typeof(int));
    var breakCondition = ilGenerator.DeclareLocal(typeof(bool));
    ilGenerator.Emit(OpCodes.Ldarg, RuntimeThreadOffsetArg);
    ilGenerator.Emit(OpCodes.Stloc, chunkIdxCounter);
    ilGenerator.Emit(OpCodes.Br, loopHeader);

    var globalIndex = ilGenerator.DeclareLocal(typeof(int));

    // Loop body
    {
        ilGenerator.MarkLabel(loopBody);

        // var index = i + chunkOffset;
        ilGenerator.Emit(OpCodes.Ldloc, chunkIdxCounter);
        ilGenerator.Emit(OpCodes.Ldarg, ChunkOffsetArg);
        ilGenerator.Emit(OpCodes.Add);
        ilGenerator.Emit(OpCodes.Stloc, globalIndex);
    }

    // Check the custom user dimension: globalIndex < targetDimension.
    // Indices outside the dimension skip the kernel call but still take part in
    // the barrier below.
    var kernelNotInvoked = ilGenerator.DefineLabel();
    ilGenerator.Emit(OpCodes.Ldloc, globalIndex);
    ilGenerator.Emit(OpCodes.Ldarg, TargetDimensionArg);
    ilGenerator.Emit(OpCodes.Clt);
    ilGenerator.Emit(OpCodes.Stloc, breakCondition);
    ilGenerator.Emit(OpCodes.Ldloc, breakCondition);
    ilGenerator.Emit(OpCodes.Brfalse, kernelNotInvoked);

    // Launch the actual kernel method
    {
        // Construct launch index from linear index
        ilGenerator.Emit(OpCodes.Ldloc, globalIndex);
        if (!entryPoint.IsGroupedIndexEntry)
        {
            // Use direct construction for 1D index
            KernelLauncherBuilder.EmitConvertFrom1DIndexToTargetIndexType(
                ungroupedIndexType,
                ilGenerator,
                () => ilGenerator.Emit(OpCodes.Ldloc, gridDim));
        }
        else
        {
            // We have to split grid and group indices for
            // GroupedIndex-reconstruction
            var linearIdx = ilGenerator.DeclareLocal(typeof(int));
            ilGenerator.Emit(OpCodes.Stloc, linearIdx);

            // Compute grid index
            ilGenerator.Emit(OpCodes.Ldloc, linearIdx);
            ilGenerator.Emit(OpCodes.Ldloc, groupDimSize);
            ilGenerator.Emit(OpCodes.Div);
            KernelLauncherBuilder.EmitConvertFrom1DIndexToTargetIndexType(
                ungroupedIndexType,
                ilGenerator,
                () => ilGenerator.Emit(OpCodes.Ldloc, gridDim));

            // Compute group index
            ilGenerator.Emit(OpCodes.Ldloc, linearIdx);
            ilGenerator.Emit(OpCodes.Ldloc, groupDimSize);
            ilGenerator.Emit(OpCodes.Rem);
            KernelLauncherBuilder.EmitConvertFrom1DIndexToTargetIndexType(
                ungroupedIndexType,
                ilGenerator,
                () => ilGenerator.Emit(OpCodes.Ldloc, groupDim));

            var groupedConstructor = entryPoint.KernelIndexType.GetConstructor(
                new Type[] { ungroupedIndexType, ungroupedIndexType });
            ilGenerator.Emit(OpCodes.Newobj, groupedConstructor);
        }

        // Arrange the cached locals in kernel-parameter order. The variable
        // indices are shifted by one when used as slots in this array.
        var variableReferences = new LocalBuilder[entryPoint.NumCustomParameters];
        for (int i = 0; i < numUniformVariables; ++i)
        {
            variableReferences[entryPoint.UniformVariables[i].Index - 1] =
                taskArgumentLocals[i];
        }
        for (int i = 0, e = sharedMemoryLocals.Length; i < e; ++i)
        {
            variableReferences[entryPoint.SharedMemoryVariables[i].Index - 1] =
                sharedMemoryLocals[i];
        }

        // Load kernel arguments
        foreach (var variableRef in variableReferences)
        {
            Debug.Assert(variableRef != null, "Invalid kernel argument");
            ilGenerator.Emit(OpCodes.Ldloc, variableRef);
        }

        // Invoke kernel
        ilGenerator.Emit(OpCodes.Call, entryPoint.MethodInfo);
    }

    // Synchronize group threads
    {
        ilGenerator.MarkLabel(kernelNotInvoked);

        // Memory barrier for interlocked calls
        ilGenerator.Emit(
            OpCodes.Call,
            typeof(Thread).GetMethod(
                nameof(Thread.MemoryBarrier),
                BindingFlags.Public | BindingFlags.Static));

        // Wait for other group threads
        ilGenerator.Emit(OpCodes.Ldarg, GroupBarrierArg);
        ilGenerator.Emit(
            OpCodes.Call,
            typeof(Barrier).GetMethod(
                nameof(Barrier.SignalAndWait),
                BindingFlags.Public | BindingFlags.Instance,
                null,
                Type.EmptyTypes,
                null));
    }

    // Increase counter
    {
        // i += groupSize
        ilGenerator.Emit(OpCodes.Ldloc, chunkIdxCounter);
        ilGenerator.Emit(OpCodes.Ldarg, GroupSizeArg);
        ilGenerator.Emit(OpCodes.Add);
        ilGenerator.Emit(OpCodes.Stloc, chunkIdxCounter);
    }

    // Loop header
    {
        ilGenerator.MarkLabel(loopHeader);

        // if (i < chunkSize) ...
        ilGenerator.Emit(OpCodes.Ldloc, chunkIdxCounter);
        ilGenerator.Emit(OpCodes.Ldarg, ChunkSizeArg);
        ilGenerator.Emit(OpCodes.Clt);
        ilGenerator.Emit(OpCodes.Stloc, breakCondition);
        ilGenerator.Emit(OpCodes.Ldloc, breakCondition);
        ilGenerator.Emit(OpCodes.Brtrue, loopBody);
    }

    // Emit final return
    ilGenerator.Emit(OpCodes.Ret);

    return execute;
}
/// <summary>
/// Generates a dynamic kernel-launcher method that will be just-in-time compiled
/// during the first invocation. Using the generated launcher lowers the overhead
/// for kernel launching dramatically, since unnecessary operations (like boxing)
/// can be avoided.
/// </summary>
/// <param name="kernel">The kernel to generate a launcher for.</param>
/// <param name="taskType">Receives the created task type.</param>
/// <param name="taskArgumentMapping">
/// Receives the created task-argument mapping that maps parameter indices of
/// uniforms and dynamically-sized shared-memory-variable-length specifications
/// to fields in the task class.
/// </param>
/// <param name="customGroupSize">
/// The custom group size for the launching operation.
/// </param>
/// <returns>The generated launcher method.</returns>
private MethodInfo GenerateKernelLauncherMethod(
    CompiledKernel kernel,
    out Type taskType,
    out FieldInfo[] taskArgumentMapping,
    int customGroupSize)
{
    var entryPoint = kernel.EntryPoint;
    AdjustAndVerifyKernelGroupSize(ref customGroupSize, entryPoint);

    var uniformVariables = entryPoint.UniformVariables;
    var numUniformVariables = uniformVariables.Length;

    var kernelParamTypes = entryPoint.CreateCustomParameterTypes();
    int numKernelParams = kernelParamTypes.Length;
    var funcParamTypes = new Type[numKernelParams + Kernel.KernelParameterOffset];

    GenerateAcceleratorTask(
        kernel,
        kernelParamTypes,
        out taskType,
        out ConstructorInfo taskConstructor,
        out taskArgumentMapping);

    // Launcher(Kernel, AcceleratorStream, [Index], ...)
    funcParamTypes[Kernel.KernelInstanceParamIdx] = typeof(Kernel);
    funcParamTypes[Kernel.KernelStreamParamIdx] = typeof(AcceleratorStream);
    funcParamTypes[Kernel.KernelParamDimensionIdx] = entryPoint.KernelIndexType;
    kernelParamTypes.CopyTo(funcParamTypes, Kernel.KernelParameterOffset);

    // Create the actual launcher method
    var func = new DynamicMethod(
        kernel.EntryName,
        typeof(void),
        funcParamTypes,
        typeof(Kernel));
    var funcParams = func.GetParameters();
    var ilGenerator = func.GetILGenerator();

    var cpuKernel = ilGenerator.DeclareLocal(typeof(CPUKernel));
    KernelLauncherBuilder.EmitLoadKernelArgument<CPUKernel>(
        Kernel.KernelInstanceParamIdx,
        ilGenerator);
    ilGenerator.Emit(OpCodes.Stloc, cpuKernel);

    // Create an instance of the custom task type
    var task = ilGenerator.DeclareLocal(taskType);
    {
        var sharedMemSize = KernelLauncherBuilder.EmitSharedMemorySizeComputation(
            entryPoint,
            ilGenerator,
            paramIdx => funcParams[paramIdx + Kernel.KernelParameterOffset]);

        ilGenerator.Emit(OpCodes.Ldloc, cpuKernel);
        ilGenerator.Emit(
            OpCodes.Call,
            typeof(CPUKernel).GetProperty(
                nameof(CPUKernel.KernelExecutionDelegate),
                BindingFlags.NonPublic | BindingFlags.Public | BindingFlags.Instance)
                .GetGetMethod(true));

        // Both dimension loads build their result via Index3(int, int, int);
        // resolve the constructor once instead of once per callback.
        var index3Constructor = typeof(Index3).GetConstructor(
            new Type[] { typeof(int), typeof(int), typeof(int) });

        // Load custom user dimension
        KernelLauncherBuilder.EmitLoadDimensions(
            entryPoint,
            ilGenerator,
            Kernel.KernelParamDimensionIdx,
            () => ilGenerator.Emit(OpCodes.Newobj, index3Constructor));

        // Load dimensions as index3 arguments
        KernelLauncherBuilder.EmitLoadDimensions(
            entryPoint,
            ilGenerator,
            Kernel.KernelParamDimensionIdx,
            () => ilGenerator.Emit(OpCodes.Newobj, index3Constructor),
            customGroupSize);

        // Load shared-memory size
        ilGenerator.Emit(OpCodes.Ldloc, sharedMemSize);

        // Create new task object
        ilGenerator.Emit(OpCodes.Newobj, taskConstructor);

        // Store task
        ilGenerator.Emit(OpCodes.Stloc, task);
    }

    // Assign parameters
    for (int i = 0; i < numUniformVariables; ++i)
    {
        ilGenerator.Emit(OpCodes.Ldloc, task);

        // ldarg takes a 16-bit argument index, so the Emit(OpCode, short)
        // overload is used; Emit(OpCode, int) is documented for a 4-byte
        // operand. The unchecked cast keeps the bit pattern of the index.
        ilGenerator.Emit(
            OpCodes.Ldarg,
            unchecked((short)(i + Kernel.KernelParameterOffset)));
        ilGenerator.Emit(OpCodes.Stfld, taskArgumentMapping[i]);
    }

    // Launch task: ((CPUKernel)kernel).CPUAccelerator.Launch(task);
    ilGenerator.Emit(OpCodes.Ldloc, cpuKernel);
    ilGenerator.Emit(
        OpCodes.Call,
        typeof(CPUKernel).GetProperty(
            nameof(CPUKernel.CPUAccelerator)).GetGetMethod(false));
    ilGenerator.Emit(OpCodes.Ldloc, task);
    ilGenerator.Emit(
        OpCodes.Call,
        typeof(CPUAccelerator).GetMethod(
            nameof(CPUAccelerator.Launch),
            BindingFlags.NonPublic | BindingFlags.Instance));

    // End of launch method
    ilGenerator.Emit(OpCodes.Ret);

    return func;
}