/// <summary>
/// Creates the "sum" kernel from the program source named "RunKernel.sum.cl", hands it to
/// <c>KernelRunner.RunKernel</c> together with two 1024-element float inputs and one
/// 1024-element float output array, then prints the first and last output elements.
/// </summary>
/// <param name="context">Context used to build the program and to wrap the host arrays.</param>
/// <param name="device">Device the program is built for; also passed on to <c>KernelRunner.RunKernel</c>.</param>
private static void RunKernel(Context context, Device device)
{
    const int size = 1024;
    const string resourceName = "RunKernel.sum.cl";

    var executingAssembly = Assembly.GetExecutingAssembly();
    var kernelSource = ProgramUtils.GetProgramSourceFromResource(executingAssembly, resourceName);
    var builtProgram = ProgramUtils.BuildProgramForDevice(context, device, kernelSource);

    ErrorCode status;
    var sumKernel = Cl.CreateKernel(builtProgram, "sum", out status);
    status.Check("CreateKernel");

    // Two separate input arrays, each holding 1, 2, ..., size.
    var floatsA = new float[size];
    var floatsB = new float[size];
    for (var i = 0; i < size; i++)
    {
        floatsA[i] = i + 1;
        floatsB[i] = i + 1;
    }

    // Host array wrapped as the WriteOnly buffer below and printed once the run has finished.
    var floatsC = new float[size];

    using (var inputA = new PinnedArrayOfStruct<float>(context, floatsA))
    using (var inputB = new PinnedArrayOfStruct<float>(context, floatsB))
    using (var output = new PinnedArrayOfStruct<float>(context, floatsC, MemMode.WriteOnly))
    {
        // NOTE(review): the int array is presumably the list of buffer indices to read back
        // (2 == output) — confirm against KernelRunner.RunKernel.
        KernelRunner.RunKernel(context, device, sumKernel, size, new[] { 2 }, inputA, inputB, output);
    }

    Console.WriteLine($"floatsC[0]: {floatsC[0]}");
    Console.WriteLine($"floatsC[1023]: {floatsC[size - 1]}");
}
/// <summary>
/// Sums 1024 * 1024 floats, each equal to 42, with the "reductionVector" and
/// "reductionComplete" kernels and prints the result next to the expected total.
/// </summary>
/// <remarks>
/// "reductionVector" is enqueued repeatedly, alternating its input and output between two
/// buffers and dividing the global work size by the local work size after every pass, until
/// the remaining global work size is no larger than the local work size. "reductionComplete"
/// is then enqueued as a single work group and its one-element output buffer is read back.
/// Profiling timestamps for the kernel passes and for the read are printed as well.
/// The command queue, the events and the kernels created here are released before returning;
/// the program returned by <c>ProgramUtils.BuildProgramForDevice</c> is left untouched.
/// </remarks>
/// <param name="context">Context used to build the program and create the buffers and command queue.</param>
/// <param name="device">Device the program is built for and the command queue is created on.</param>
private static void Reduction(Context context, Device device)
{
    const string resourceName = "ReductionVectorComplete.reduction.cl";
    const int numValues = 1024 * 1024;
    const int numValuesPerWorkItem = 4;
    // A fixed work-group size is used here instead of querying
    // KernelWorkGroupInfo.WorkGroupSize for the kernel.
    const int localWorkSize = 32;
    const int value = 42;
    const int correctAnswer = numValues * value;

    var source = ProgramUtils.GetProgramSourceFromResource(Assembly.GetExecutingAssembly(), resourceName);
    var program = ProgramUtils.BuildProgramForDevice(context, device, source);

    var globalWorkSize = numValues / numValuesPerWorkItem;

    var data1 = Enumerable.Repeat(value, numValues).Select(n => (float)n).ToArray();
    // Zero-filled scratch array: numValuesPerWorkItem floats per work group of the first pass.
    var data2 = new float[globalWorkSize / localWorkSize * numValuesPerWorkItem];
    var sum = new float[1];

    ErrorCode errorCode;
    var kernel1 = Cl.CreateKernel(program, "reductionVector", out errorCode);
    errorCode.Check("CreateKernel(reductionVector)");
    try
    {
        var kernel2 = Cl.CreateKernel(program, "reductionComplete", out errorCode);
        errorCode.Check("CreateKernel(reductionComplete)");
        try
        {
            using (var memData1 = new PinnedArrayOfStruct<float>(context, data1, MemMode.ReadWrite))
            // ReadWrite rather than WriteOnly: this buffer is also bound as a kernel *input*
            // (argument 0 of kernel1 on odd passes, and argument 0 of kernel2 when it holds the
            // final partial sums). Presumably MemMode maps onto the CL_MEM_* flags, under which
            // kernel reads from a write-only buffer are undefined.
            using (var memData2 = new PinnedArrayOfStruct<float>(context, data2, MemMode.ReadWrite))
            using (var memSum = new PinnedArrayOfStruct<float>(context, sum, MemMode.WriteOnly))
            {
                var commandQueue = Cl.CreateCommandQueue(context, device, CommandQueueProperties.ProfilingEnable, out errorCode);
                errorCode.Check("CreateCommandQueue");

                // Every event handed back by an enqueue call, in enqueue order, so that all of
                // them can be released in the finally block. events[0] is the first kernel1 pass.
                var events = new List<Event>();
                try
                {
                    var memResult = memData2;

                    // Argument 2 of kernel1 is local memory: localWorkSize * numValuesPerWorkItem floats.
                    errorCode = Cl.SetKernelArg<float>(kernel1, 2, localWorkSize * numValuesPerWorkItem);
                    errorCode.Check("SetKernelArg(2)");

                    for (var pass = 0; ; pass++)
                    {
                        // Ping-pong: even passes read memData1 and write memData2, odd passes the reverse.
                        var evenPass = pass % 2 == 0;
                        var memDataIn = evenPass ? memData1 : memData2;
                        var memDataOut = evenPass ? memData2 : memData1;
                        memResult = memDataOut;

                        errorCode = Cl.SetKernelArg(kernel1, 0, memDataIn.Buffer);
                        errorCode.Check("SetKernelArg(0)");

                        errorCode = Cl.SetKernelArg(kernel1, 1, memDataOut.Buffer);
                        errorCode.Check("SetKernelArg(1)");

                        Console.WriteLine($"Calling EnqueueNDRangeKernel(kernel1) with globalWorkSize: {globalWorkSize}; localWorkSize: {localWorkSize}; num work groups: {globalWorkSize / localWorkSize}");

                        Event e;
                        errorCode = Cl.EnqueueNDRangeKernel(
                            commandQueue,
                            kernel1,
                            1, // workDim
                            null, // globalWorkOffset
                            new[] { (IntPtr)globalWorkSize },
                            new[] { (IntPtr)localWorkSize },
                            0, // numEventsInWaitList
                            null, // eventWaitList
                            out e);
                        errorCode.Check("EnqueueNDRangeKernel");
                        events.Add(e);

                        // The next pass (or kernel2) gets one work item per work group of this pass.
                        globalWorkSize /= localWorkSize;
                        if (globalWorkSize <= localWorkSize)
                        {
                            break;
                        }
                    }

                    errorCode = Cl.SetKernelArg(kernel2, 0, memResult.Buffer);
                    errorCode.Check("SetKernelArg(0)");

                    errorCode = Cl.SetKernelArg<float>(kernel2, 1, localWorkSize * numValuesPerWorkItem);
                    errorCode.Check("SetKernelArg(1)");

                    errorCode = Cl.SetKernelArg(kernel2, 2, memSum.Buffer);
                    errorCode.Check("SetKernelArg(2)");

                    Console.WriteLine($"Calling EnqueueNDRangeKernel(kernel2) with globalWorkSize: {globalWorkSize}");

                    Event kernel2Event;
                    errorCode = Cl.EnqueueNDRangeKernel(
                        commandQueue,
                        kernel2,
                        1, // workDim
                        null, // globalWorkOffset
                        new[] { (IntPtr)globalWorkSize },
                        /*
                         * Force the use of a single work group by setting 'localWorkSize'
                         * to be the same as 'globalWorkSize'. Without doing this,
                         * "Experimental OpenCL 2.0 CPU Only Platform" seems to use 8 work groups,
                         * each with a local size of 1, which means that we end up with an incorrect
                         * result.
                         */
                        new[] { (IntPtr)globalWorkSize },
                        0, // numEventsInWaitList
                        null, // eventWaitList
                        out kernel2Event);
                    errorCode.Check("EnqueueNDRangeKernel");
                    events.Add(kernel2Event);

                    Event readEvent;
                    errorCode = Cl.EnqueueReadBuffer(
                        commandQueue,
                        memSum.Buffer,
                        Bool.False, // blockingRead
                        IntPtr.Zero, // offsetInBytes
                        (IntPtr)memSum.Size,
                        memSum.Handle,
                        0, // numEventsInWaitList
                        null, // eventWaitList
                        out readEvent);
                    errorCode.Check("EnqueueReadBuffer");
                    events.Add(readEvent);

                    // The read is non-blocking, so wait for the whole queue before touching 'sum'
                    // or the profiling data.
                    errorCode = Cl.Finish(commandQueue);
                    errorCode.Check("Finish");

                    // Kernel time spans from the start of the first kernel1 pass to the end of kernel2.
                    var start1 = Cl.GetEventProfilingInfo(events[0], ProfilingInfo.Start, out errorCode).CastTo<long>();
                    errorCode.Check("GetEventProfilingInfo(ProfilingInfo.Start)");

                    var end1 = Cl.GetEventProfilingInfo(kernel2Event, ProfilingInfo.End, out errorCode).CastTo<long>();
                    errorCode.Check("GetEventProfilingInfo(ProfilingInfo.End)");

                    var start2 = Cl.GetEventProfilingInfo(readEvent, ProfilingInfo.Start, out errorCode).CastTo<long>();
                    errorCode.Check("GetEventProfilingInfo(ProfilingInfo.Start)");

                    var end2 = Cl.GetEventProfilingInfo(readEvent, ProfilingInfo.End, out errorCode).CastTo<long>();
                    errorCode.Check("GetEventProfilingInfo(ProfilingInfo.End)");

                    Console.WriteLine($"kernel1/kernel2 elapsed time: {end1 - start1:N0}ns");
                    Console.WriteLine($"read buffer elapsed time: {end2 - start2:N0}ns");
                }
                finally
                {
                    // Best-effort cleanup: the returned error codes are intentionally ignored so
                    // that a failing release cannot mask an exception already in flight.
                    foreach (var pending in events)
                    {
                        Cl.ReleaseEvent(pending);
                    }

                    Cl.ReleaseCommandQueue(commandQueue);
                }
            }
        }
        finally
        {
            Cl.ReleaseKernel(kernel2);
        }
    }
    finally
    {
        Cl.ReleaseKernel(kernel1);
    }

    Console.WriteLine($"OpenCL final answer: {Math.Truncate(sum[0]):N0}; Correct answer: {correctAnswer:N0}");
}
/// <summary>
/// Sums 1024 * 1024 floats, each equal to 42, with the "reductionScalar" kernel: the kernel
/// is enqueued once, the per-work-group results are read back, and the host adds them up and
/// prints the total next to the expected answer along with profiling timings.
/// </summary>
/// <remarks>
/// The local work size is the value reported for the kernel by
/// <c>KernelWorkGroupInfo.WorkGroupSize</c>. The kernel, command queue and events created
/// here are released before returning; the program returned by
/// <c>ProgramUtils.BuildProgramForDevice</c> is left untouched.
/// </remarks>
/// <param name="context">Context used to build the program and create the buffers and command queue.</param>
/// <param name="device">Device the program is built for and the command queue is created on.</param>
/// <exception cref="InvalidOperationException">
/// The reported work-group size is not positive or does not evenly divide the global work size.
/// </exception>
private static void Reduction(Context context, Device device)
{
    const string resourceName = "ReductionScalar.reduction.cl";
    const int numValues = 1024 * 1024;
    const int numValuesPerWorkItem = 1;
    const int globalWorkSize = numValues / numValuesPerWorkItem;
    const int value = 42;
    const int correctAnswer = numValues * value;

    var source = ProgramUtils.GetProgramSourceFromResource(Assembly.GetExecutingAssembly(), resourceName);
    var program = ProgramUtils.BuildProgramForDevice(context, device, source);

    ErrorCode errorCode;
    var kernel = Cl.CreateKernel(program, "reductionScalar", out errorCode);
    errorCode.Check("CreateKernel");
    try
    {
        var localWorkSize = Cl.GetKernelWorkGroupInfo(kernel, device, KernelWorkGroupInfo.WorkGroupSize, out errorCode).CastTo<int>();
        errorCode.Check("GetKernelWorkGroupInfo(KernelWorkGroupInfo.WorkGroupSize)");
        Console.WriteLine($"localWorkSize: {localWorkSize}");

        // The result array below holds exactly globalWorkSize / localWorkSize entries, so a
        // size that does not divide evenly would leave it one work group short (and a zero
        // size would divide by zero). Fail early with a clear message instead.
        if (localWorkSize <= 0 || globalWorkSize % localWorkSize != 0)
        {
            throw new InvalidOperationException(
                $"localWorkSize ({localWorkSize}) must be positive and evenly divide globalWorkSize ({globalWorkSize}).");
        }

        var numWorkGroups = globalWorkSize / localWorkSize;

        var data = Enumerable.Repeat(value, globalWorkSize).Select(n => (float)n).ToArray();
        var workGroupResults = new float[numWorkGroups * numValuesPerWorkItem];

        using (var mem1 = new PinnedArrayOfStruct<float>(context, data))
        using (var mem2 = new PinnedArrayOfStruct<float>(context, workGroupResults, MemMode.WriteOnly))
        {
            var commandQueue = Cl.CreateCommandQueue(context, device, CommandQueueProperties.ProfilingEnable, out errorCode);
            errorCode.Check("CreateCommandQueue");

            // Events handed back by the two enqueue calls; only the first 'eventCount'
            // entries are valid and need releasing.
            var events = new Event[2];
            var eventCount = 0;
            try
            {
                errorCode = Cl.SetKernelArg(kernel, 0, mem1.Buffer);
                errorCode.Check("SetKernelArg(0)");

                // Argument 1 is local memory: localWorkSize floats.
                errorCode = Cl.SetKernelArg<float>(kernel, 1, localWorkSize);
                errorCode.Check("SetKernelArg(1)");

                errorCode = Cl.SetKernelArg(kernel, 2, mem2.Buffer);
                errorCode.Check("SetKernelArg(2)");

                Event e1;
                errorCode = Cl.EnqueueNDRangeKernel(
                    commandQueue,
                    kernel,
                    1, // workDim
                    null, // globalWorkOffset
                    new[] { (IntPtr)globalWorkSize },
                    new[] { (IntPtr)localWorkSize },
                    0, // numEventsInWaitList
                    null, // eventWaitList
                    out e1);
                errorCode.Check("EnqueueNDRangeKernel");
                events[eventCount++] = e1;

                Event e2;
                errorCode = Cl.EnqueueReadBuffer(
                    commandQueue,
                    mem2.Buffer,
                    Bool.False, // blockingRead
                    IntPtr.Zero, // offsetInBytes
                    (IntPtr)mem2.Size,
                    mem2.Handle,
                    0, // numEventsInWaitList
                    null, // eventWaitList
                    out e2);
                errorCode.Check("EnqueueReadBuffer");
                events[eventCount++] = e2;

                // The read is non-blocking, so wait on its event before using the results.
                var evs = new[] { e2 };
                errorCode = Cl.WaitForEvents((uint)evs.Length, evs);
                errorCode.Check("WaitForEvents");

                var start1 = Cl.GetEventProfilingInfo(e1, ProfilingInfo.Start, out errorCode).CastTo<long>();
                errorCode.Check("GetEventProfilingInfo(ProfilingInfo.Start)");

                var end1 = Cl.GetEventProfilingInfo(e1, ProfilingInfo.End, out errorCode).CastTo<long>();
                errorCode.Check("GetEventProfilingInfo(ProfilingInfo.End)");

                var start2 = Cl.GetEventProfilingInfo(e2, ProfilingInfo.Start, out errorCode).CastTo<long>();
                errorCode.Check("GetEventProfilingInfo(ProfilingInfo.Start)");

                var end2 = Cl.GetEventProfilingInfo(e2, ProfilingInfo.End, out errorCode).CastTo<long>();
                errorCode.Check("GetEventProfilingInfo(ProfilingInfo.End)");

                Console.WriteLine($"e1 elapsed time: {end1 - start1:N0}ns");
                Console.WriteLine($"e2 elapsed time: {end2 - start2:N0}ns");
            }
            finally
            {
                // Best-effort cleanup: the returned error codes are intentionally ignored so
                // that a failing release cannot mask an exception already in flight.
                for (var i = 0; i < eventCount; i++)
                {
                    Cl.ReleaseEvent(events[i]);
                }

                Cl.ReleaseCommandQueue(commandQueue);
            }
        }

        // The kernel leaves one partial sum per work group; the final addition happens on the host.
        var finalAnswer = Math.Truncate(workGroupResults.Sum());
        Console.WriteLine($"OpenCL final answer: {finalAnswer:N0}; Correct answer: {correctAnswer:N0}");
    }
    finally
    {
        Cl.ReleaseKernel(kernel);
    }
}