/// <summary cref="LLVMBackend.CreateEntry(CompileUnit, EntryPoint, out string)"/>
/// <remarks>
/// Emits (or reuses) the externally visible kernel function that wraps the
/// virtual entry point of <paramref name="entryPoint"/>:
/// <list type="bullet">
/// <item>If a function with the computed kernel name already exists in the
/// module, its linkage is set to external and it is returned unchanged.</item>
/// <item>Otherwise a new function with the blocks "Main" and "Exit" (plus "Core"
/// for entries that are not grouped-index entries) is created, the index value
/// is computed, uniform and shared-memory arguments are prepared and the
/// virtual entry point is called.</item>
/// </list>
/// The temporary IR builder used for emission is always disposed before the
/// compile unit is optimized.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// Thrown when no PTX device functions are registered for <paramref name="unit"/>.
/// </exception>
internal override LLVMValueRef CreateEntry(CompileUnit unit, EntryPoint entryPoint, out string entryPointName)
{
    if (!ptxDeviceFunctions.TryGetValue(unit, out PTXDeviceFunctions deviceFunctions))
    {
        throw new InvalidOperationException(ErrorMessages.NotSupportedCompileUnit);
    }

    entryPointName = unit.GetLLVMName(entryPoint.MethodInfo, CudaKernelCategory);

    var context = unit.LLVMContext;
    var module = unit.LLVMModule;

    // Reuse an already emitted kernel with the same name; only make sure it is
    // externally visible.
    LLVMValueRef cudaEntryPoint = GetNamedFunction(module, entryPointName);
    if (cudaEntryPoint.Pointer != IntPtr.Zero)
    {
        SetLinkage(cudaEntryPoint, LLVMLinkage.LLVMExternalLinkage);
        return cudaEntryPoint;
    }

    var entryPointType = CreatePTXKernelFunctionType(unit, entryPoint, out int parameterOffset);
    cudaEntryPoint = AddFunction(module, entryPointName, entryPointType);
    SetLinkage(cudaEntryPoint, LLVMLinkage.LLVMExternalLinkage);

    var entryBlock = AppendBasicBlock(cudaEntryPoint, "Main");
    var exitBlock = AppendBasicBlock(cudaEntryPoint, "Exit");

    // The builder wraps a native LLVM object; release it even if emission throws.
    var builder = CreateBuilderInContext(unit.LLVMContext);
    try
    {
        PositionBuilderAtEnd(builder, entryBlock);

        // Create a proper entry point for the virtual entry point
        var indexValue = CreateIndexValue(unit, entryPoint, builder, deviceFunctions);
        var groupIndexValue = CreateGroupIndexValue(unit, entryPoint, builder, deviceFunctions);

        if (!entryPoint.IsGroupedIndexEntry)
        {
            // We have to generate code for an implicitly grouped kernel
            // -> Compute the actual global idx
            indexValue = CreateGlobalIndexValue(
                unit,
                entryPoint,
                builder,
                deviceFunctions,
                indexValue,
                groupIndexValue);

            // Append a new main block that contains the actual body
            var mainBlock = AppendBasicBlock(cudaEntryPoint, "Core");

            // Emit the required check (custom dimension size is stored in parameter 0).
            // This check is required to ensure that the index is always smaller than the
            // specified user size. Otherwise, the index might be larger due to custom blocking!
            Debug.Assert(parameterOffset > 0);
            var rangeComparisonResult = CreateGlobalIndexRangeComparison(
                unit,
                entryPoint,
                builder,
                deviceFunctions,
                indexValue,
                GetParam(cudaEntryPoint, 0));
            BuildCondBr(builder, rangeComparisonResult, mainBlock, exitBlock);

            // Move builder to main block to emit the actual kernel body
            PositionBuilderAtEnd(builder, mainBlock);
        }
        else
        {
            Debug.Assert(parameterOffset < 1);
            indexValue = CreateGroupedIndex(
                unit,
                entryPoint,
                builder,
                deviceFunctions,
                indexValue,
                groupIndexValue);
        }

        // Call the virtual entry point: slot 0 holds the index value, the remaining
        // slots are addressed via the Index of each uniform / shared-memory variable.
        LLVMValueRef[] kernelValues = new LLVMValueRef[entryPoint.NumCustomParameters + 1];
        kernelValues[0] = indexValue;

        var kernelParameters = GetParams(cudaEntryPoint);
        var uniformVariables = entryPoint.UniformVariables;
        for (int i = 0, kernelParamIdx = parameterOffset, e = uniformVariables.Length; i < e; ++i, ++kernelParamIdx)
        {
            var variable = uniformVariables[i];
            LLVMValueRef kernelParam;
            var kernelValue = kernelParam = kernelParameters[kernelParamIdx];
            if (variable.VariableType.IsPassedViaPtr())
            {
                // We have to generate a local alloca and store the current parameter value
                kernelValue = BuildAlloca(builder, TypeOf(kernelParam), string.Empty);
                BuildStore(builder, kernelParam, kernelValue);
            }
            kernelValues[variable.Index] = kernelValue;
        }

        var sharedMemoryVariables = entryPoint.SharedMemoryVariables;
        foreach (var variable in sharedMemoryVariables)
        {
            // This type can be: ArrayType<T> or VariableType<T>
            var variableType = unit.GetType(variable.Type);
            var variableElementType = unit.GetType(variable.ElementType);
            var sharedVariable = GetUndef(variableType);
            if (variable.IsArray)
            {
                // However, ArrayType<T> encapsulates the type ArrayView<T, Index>
                var genericArrayView = GetUndef(GetStructElementTypes(variableType)[0]);

                // A missing count yields a zero-length array type; the length is
                // then taken from a kernel parameter below.
                var arrayType = ArrayType(
                    variableElementType,
                    variable.Count != null ? variable.Count.Value : 0);
                var sharedMem = DeclareSharedMemoryVariable(unit, builder, arrayType);
                genericArrayView = BuildInsertValue(builder, genericArrayView, sharedMem, 0, string.Empty);

                LLVMValueRef intIndex;
                if (variable.Count != null)
                {
                    intIndex = ConstInt(context.Int32Type, variable.Count.Value, false);
                }
                else
                {
                    // Attach the right length information that is given via a parameter
                    // NOTE(review): unlike the uniform-variable loop above, this index
                    // does not add parameterOffset - confirm against the parameter
                    // layout produced by CreatePTXKernelFunctionType.
                    Debug.Assert(variable.SharedMemoryIndex >= 0);
                    intIndex = kernelParameters[uniformVariables.Length + variable.SharedMemoryIndex];
                }

                var indexInstance = GetUndef(unit.GetType(typeof(Index)));
                indexInstance = BuildInsertValue(builder, indexInstance, intIndex, 0, string.Empty);
                genericArrayView = BuildInsertValue(builder, genericArrayView, indexInstance, 1, string.Empty);
                sharedVariable = BuildInsertValue(builder, sharedVariable, genericArrayView, 0, string.Empty);
            }
            else
            {
                var sharedMem = DeclareSharedMemoryVariable(unit, builder, variableElementType);

                // Insert pointer into variable view
                sharedVariable = BuildInsertValue(builder, sharedVariable, sharedMem, 0, string.Empty);
            }

            // Setup the pointer as generic pointer
            kernelValues[variable.Index] = sharedVariable;
        }

        // Declare external entry point
        var virtualEntryPoint = unit.GetMethod(entryPoint.MethodInfo);
        BuildCall(builder, virtualEntryPoint.LLVMFunction, kernelValues);

        // Verify method access in the scope of implicitly-grouped kernels
        if (!entryPoint.IsGroupedIndexEntry)
        {
            virtualEntryPoint.VisitCalls((instruction, calledMethod) =>
            {
                CodeGenerator.VerifyAccessToMethodInImplicitlyGroupedKernel(
                    unit.CompilationContext,
                    calledMethod.MethodBase,
                    entryPoint);
            });
        }

        // Jump to exit block
        BuildBr(builder, exitBlock);

        // Build exit block
        PositionBuilderAtEnd(builder, exitBlock);
        BuildRetVoid(builder);
    }
    finally
    {
        DisposeBuilder(builder);
    }

    unit.Optimize();

    return cudaEntryPoint;
}