/// <summary>
/// Launches the "ExternalLoopBody" kernel with a single work-item over a six-element
/// integer buffer and verifies the values read back from the device.
/// </summary>
/// <param name="program">Program from which the "ExternalLoopBody" kernel is created.</param>
public void ExternalLoopBody(Cl.Program program)
{
    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "ExternalLoopBody", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // allocate host vectors
    int[] hres = { 0, 1, 2, 3, 4, 5 };

    // allocate device vectors (initialised from the host array via CopyHostPtr)
    Cl.Mem dres = Cl.CreateBuffer(context, Cl.MemFlags.ReadWrite | Cl.MemFlags.CopyHostPtr,
        (IntPtr)(sizeof(int) * hres.Length), hres, out error);
    clSafeCall(error);

    try
    {
        // setup kernel arguments
        clSafeCall(Cl.SetKernelArg(kernel, 0, dres));
        clSafeCall(Cl.SetKernelArg(kernel, 1, hres.Length));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dres, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(int) * hres.Length), hres, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        Assert.AreEqual(new[] { 1, 4, 3, 6, 5, 8 }, hres);
    }
    finally
    {
        // release the handles created above even when a call or the assertion fails
        dres.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}
/// <summary>
/// Picks the first GPU device reported by any platform, creates a context for it and creates
/// the <c>dummy</c> buffer handle that other methods of this fixture pass as a kernel argument.
/// </summary>
public void SetUp()
{
    // First() throws when no platform exposes a GPU device
    device = (from platformid in Cl.GetPlatformIDs(out error)
              from deviceid in Cl.GetDeviceIDs(platformid, Cl.DeviceType.Gpu, out error)
              select deviceid).First();

    context = Cl.CreateContext(null, 1, new[] { device }, null, IntPtr.Zero, out error);
    // fail here rather than in every later call that would receive an invalid context
    clSafeCall(error);

    // NOTE(review): this requests a zero-sized buffer with no host pointer; the resulting
    // error code is left unchecked exactly as before -- confirm that is intentional.
    dummy = Cl.CreateBuffer(context, Cl.MemFlags.ReadOnly, IntPtr.Zero, IntPtr.Zero, out error);
}
/// <summary>
/// Launches the "ArrayCompare" kernel twice with a single work-item: first with two distinct
/// buffers as arguments 0 and 1, then with the shared <c>dummy</c> handle for both, and
/// verifies the four booleans written to the third buffer after each launch.
/// </summary>
/// <param name="program">Program from which the "ArrayCompare" kernel is created.</param>
public void ArrayCompare(Cl.Program program)
{
    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "ArrayCompare", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // allocate host vectors
    bool[] res = { true, false, true, false };

    // allocate device vectors
    Cl.Mem dp1 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly, (IntPtr)(sizeof(int)), IntPtr.Zero, out error);
    clSafeCall(error);
    Cl.Mem dp2 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly, (IntPtr)(sizeof(int)), IntPtr.Zero, out error);
    clSafeCall(error);
    Cl.Mem dp3 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly, (IntPtr)(sizeof(bool) * res.Length), IntPtr.Zero, out error);
    clSafeCall(error);

    try
    {
        // setup kernel arguments: two different buffers
        clSafeCall(Cl.SetKernelArg(kernel, 0, dp1));
        clSafeCall(Cl.SetKernelArg(kernel, 1, dp2));
        clSafeCall(Cl.SetKernelArg(kernel, 2, dp3));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp3, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(bool) * res.Length), res, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        Assert.AreEqual(new[] { false, true, false, true }, res);

        // setup kernel arguments: the same dummy handle for both inputs
        clSafeCall(Cl.SetKernelArg(kernel, 0, dummy));
        clSafeCall(Cl.SetKernelArg(kernel, 1, dummy));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp3, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(bool) * res.Length), res, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        Assert.AreEqual(new[] { true, false, true, false }, res);
    }
    finally
    {
        // release the handles created above even when a call or an assertion fails
        dp3.Dispose();
        dp2.Dispose();
        dp1.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}
/// <summary>
/// Launches the "ArrayRefOut" kernel with a single work-item on two one-element integer
/// buffers (plus the shared <c>dummy</c> handle as argument 2) and verifies both values
/// read back from the device.
/// </summary>
/// <param name="program">Program from which the "ArrayRefOut" kernel is created.</param>
public void ArrayRefOut(Cl.Program program)
{
    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "ArrayRefOut", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // allocate host vectors
    int[] hp1 = { 1 };
    int[] hp2 = { 2 };

    // allocate device vectors (initialised from the host arrays via CopyHostPtr)
    Cl.Mem dp1 = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadWrite,
        (IntPtr)(sizeof(int) * hp1.Length), hp1, out error);
    clSafeCall(error);
    Cl.Mem dp2 = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadWrite,
        (IntPtr)(sizeof(int) * hp2.Length), hp2, out error);
    clSafeCall(error);

    try
    {
        // setup kernel arguments
        clSafeCall(Cl.SetKernelArg(kernel, 0, dp1));
        clSafeCall(Cl.SetKernelArg(kernel, 1, dp2));
        clSafeCall(Cl.SetKernelArg(kernel, 2, dummy));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp1, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(int) * hp1.Length), hp1, 0, null, out clevent));
        // the transfer size must follow the destination array (hp2), not hp1
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dp2, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(int) * hp2.Length), hp2, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        Assert.AreEqual(5, hp1[0]);
        Assert.AreEqual(4, hp2[0]);
    }
    finally
    {
        // release the handles created above even when a call or an assertion fails
        dp2.Dispose();
        dp1.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}
/// <summary>
/// Runs the "PoissonJacobi" kernel N times on a dimX x dimY grid, swapping the input and
/// output buffers between launches, then compares the interior of the result against the
/// reference function <c>u</c> and prints error and timing figures.
/// </summary>
public void PoissonJacobi()
{
    if (!prepared)
    {
        Prepare(this.BuildIR().InlineIR());
        prepared = true;
    }

    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "PoissonJacobi", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // initialize host memory
    uint dimX = 162;
    uint dimY = 122;
    uint N = 15000;

    float x0 = (float)(-0.25 * Math.PI);
    float y0 = (float)(-0.25 * Math.PI);

    float hx = 2.0f * Math.Abs(x0) / dimX;
    float hy = 2.0f * Math.Abs(y0) / dimY;

    float[] hData = new float[dimX * dimY];

    uint stride = dimX;

    // boundary values: left/right columns first, then top/bottom rows
    for (uint i = 1; i < dimY - 1; i++)
    {
        uint y_idx = i * stride;
        float y_val = y0 + i * hy;
        hData[y_idx] = u(x0, y_val);
        hData[y_idx + dimX - 1] = u(x0 + (dimX - 1) * hx, y_val);
    }
    for (uint j = 1; j < dimX - 1; j++)
    {
        float x_val = x0 + j * hx;
        hData[j] = u(x_val, y0);
        hData[j + (dimY - 1) * stride] = u(x_val, y0 + (dimY - 1) * hy);
    }

    // allocate device vectors (both start from the same host data)
    Cl.Mem input = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadWrite,
        (IntPtr)(sizeof(float) * hData.Length), hData, out error);
    clSafeCall(error);
    Cl.Mem output = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadWrite,
        (IntPtr)(sizeof(float) * hData.Length), hData, out error);
    clSafeCall(error);

    try
    {
        float a1 = 2 * hy / hx;
        float a2 = 2 * hx / hy;
        float a3 = a1;
        float a4 = a2;
        float a = a1 + a2 + a3 + a4;

        // setup kernel arguments (argument 2 is given a size and a null value)
        clSafeCall(Cl.SetKernelArg(kernel, 2, (AREA_SIZE_Y + 2) * (AREA_SIZE_X + 2) * sizeof(float), null));
        clSafeCall(Cl.SetKernelArg(kernel, 3, dimX));
        clSafeCall(Cl.SetKernelArg(kernel, 4, dimY));
        clSafeCall(Cl.SetKernelArg(kernel, 5, stride));
        clSafeCall(Cl.SetKernelArg(kernel, 6, a1));
        clSafeCall(Cl.SetKernelArg(kernel, 7, a2));
        clSafeCall(Cl.SetKernelArg(kernel, 8, a3));
        clSafeCall(Cl.SetKernelArg(kernel, 9, a4));
        clSafeCall(Cl.SetKernelArg(kernel, 10, a));
        clSafeCall(Cl.SetKernelArg(kernel, 11, hx));
        clSafeCall(Cl.SetKernelArg(kernel, 12, hy));
        clSafeCall(Cl.SetKernelArg(kernel, 13, x0));
        clSafeCall(Cl.SetKernelArg(kernel, 14, y0));

        IntPtr[] lo = { (IntPtr)16, (IntPtr)16 };
        IntPtr[] gl = {
            (IntPtr)((dimX - 2 + AREA_SIZE_X - 1) / AREA_SIZE_X * 16),
            (IntPtr)((dimY - 2 + AREA_SIZE_Y - 1) / AREA_SIZE_Y * 16)
        };

        Cl.Mem curIn = input;
        Cl.Mem curOut = output;

        // execute kernel (and perform data transferring silently);
        // this first launch completes before the stopwatch is started
        clSafeCall(Cl.SetKernelArg(kernel, 0, curIn));
        clSafeCall(Cl.SetKernelArg(kernel, 1, curOut));
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, gl, lo, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();

        for (uint idx = 1; idx < N; idx++)
        {
            // swap buffers
            Cl.Mem temp = curIn;
            curIn = curOut;
            curOut = temp;

            // execute kernel
            clSafeCall(Cl.SetKernelArg(kernel, 0, curIn));
            clSafeCall(Cl.SetKernelArg(kernel, 1, curOut));
            clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, gl, lo, 0, null, out clevent));
        }

        clSafeCall(Cl.Finish(cmdQueue));

        stopwatch.Stop();

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, curOut, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(float) * hData.Length), hData, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        // relative error of every interior point against the reference function
        float avgerr = 0, maxerr = 0;

        for (uint i = 1; i < dimY - 1; i++)
        {
            for (uint j = 1; j < dimX - 1; j++)
            {
                float theory = u(x0 + j * hx, y0 + i * hy);
                float err = Math.Abs(theory - hData[j + i * stride]) / Math.Abs(theory);
                avgerr += err;
                maxerr = Math.Max(maxerr, err);
            }
        }

        // NOTE(review): the sum covers (dimX - 2) * (dimY - 2) interior points but is divided
        // by dimX * dimY; kept as-is so the thresholds below keep their meaning.
        avgerr /= dimX * dimY;

        long elapsedTime = stopwatch.ElapsedMilliseconds;
        double dataSizePerIteration = dimX * dimY * 2 * sizeof(float);
        double dataSizeTotal = dataSizePerIteration * N;
        double elapsedSeconds = elapsedTime * 0.001;
        double gigabyteFactor = 1 << 30;
        double bandwidth = dataSizeTotal / (gigabyteFactor * elapsedSeconds);

        Console.WriteLine("avgerr = {0} maxerr = {1} elapsedTime = {2} ms bandwidth = {3} GB/s",
            avgerr, maxerr, elapsedTime, bandwidth);

        Assert.That(maxerr, Is.LessThanOrEqualTo(5E-2F));
        Assert.That(avgerr, Is.LessThanOrEqualTo(1E-2F));
    }
    finally
    {
        // release the handles created above even when a call or an assertion fails
        output.Dispose();
        input.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}
/// <summary>
/// Multiplies two randomly filled matrices with the "MatMul" kernel and checks every element
/// of the device result against a host-side triple-loop product (relative error at most 1E-3).
/// </summary>
public void MatMul()
{
    if (!prepared)
    {
        Prepare(this.BuildIR().InlineIR());
        prepared = true;
    }

    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "MatMul", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // allocate host matrices
    float[] A = new float[WA * HA];
    float[] B = new float[WB * HB];
    float[] C = new float[WC * HC];

    // initialize host memory
    Random rand = new Random();
    for (int i = 0; i < A.Length; i++)
    {
        A[i] = (float)rand.Next() / short.MaxValue;
    }
    for (int i = 0; i < B.Length; i++)
    {
        B[i] = (float)rand.Next() / short.MaxValue;
    }

    // allocate device vectors
    Cl.Mem hDeviceMemA = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadOnly,
        (IntPtr)(sizeof(float) * A.Length), A, out error);
    clSafeCall(error);
    Cl.Mem hDeviceMemB = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadOnly,
        (IntPtr)(sizeof(float) * B.Length), B, out error);
    clSafeCall(error);
    Cl.Mem hDeviceMemC = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly,
        (IntPtr)(sizeof(float) * C.Length), IntPtr.Zero, out error);
    clSafeCall(error);

    try
    {
        // setup kernel arguments (arguments 3 and 4 are given a size and a null value)
        clSafeCall(Cl.SetKernelArg(kernel, 0, hDeviceMemA));
        clSafeCall(Cl.SetKernelArg(kernel, 1, hDeviceMemB));
        clSafeCall(Cl.SetKernelArg(kernel, 2, hDeviceMemC));
        clSafeCall(Cl.SetKernelArg(kernel, 3, BLOCK_SIZE * BLOCK_SIZE * sizeof(float), null));
        clSafeCall(Cl.SetKernelArg(kernel, 4, BLOCK_SIZE * BLOCK_SIZE * sizeof(float), null));
        clSafeCall(Cl.SetKernelArg(kernel, 5, WA));
        clSafeCall(Cl.SetKernelArg(kernel, 6, WB));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null,
            new[] { (IntPtr)WC, (IntPtr)HC },
            new[] { (IntPtr)BLOCK_SIZE, (IntPtr)BLOCK_SIZE },
            0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, hDeviceMemC, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(float) * C.Length), C, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        // verify against a host-side reference product
        for (int i = 0; i < HA; ++i)
        {
            for (int j = 0; j < WB; ++j)
            {
                float sum = 0;
                for (int k = 0; k < WA; ++k)
                {
                    sum += A[i * WA + k] * B[k * WB + j];
                }
                float err = Math.Abs((sum - C[i * WB + j]) / sum);
                Assert.That(err, Is.LessThanOrEqualTo(1E-3F));
            }
        }
    }
    finally
    {
        // release the handles created above even when a call or an assertion fails
        hDeviceMemC.Dispose();
        hDeviceMemB.Dispose();
        hDeviceMemA.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}
/// <summary>
/// Adds two randomly filled 1024-element vectors with the "VecAdd" kernel and checks every
/// element of the device result against the host-side sum (relative error at most 1E-3).
/// </summary>
public void VecAdd()
{
    if (!prepared)
    {
        Prepare(this.BuildIR().InlineIR());
        prepared = true;
    }

    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "VecAdd", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    int length = 1 << 10;

    // allocate host vectors
    float[] A = new float[length];
    float[] B = new float[length];
    float[] C = new float[length];

    // initialize host memory
    Random rand = new Random();
    for (int i = 0; i < length; i++)
    {
        A[i] = (float)rand.Next() / short.MaxValue;
        B[i] = (float)rand.Next() / short.MaxValue;
    }

    // allocate device vectors
    Cl.Mem hDeviceMemA = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadOnly,
        (IntPtr)(sizeof(float) * length), A, out error);
    clSafeCall(error);
    Cl.Mem hDeviceMemB = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadOnly,
        (IntPtr)(sizeof(float) * length), B, out error);
    clSafeCall(error);
    Cl.Mem hDeviceMemC = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly,
        (IntPtr)(sizeof(float) * length), IntPtr.Zero, out error);
    clSafeCall(error);

    try
    {
        // setup kernel arguments
        clSafeCall(Cl.SetKernelArg(kernel, 0, hDeviceMemA));
        clSafeCall(Cl.SetKernelArg(kernel, 1, hDeviceMemB));
        clSafeCall(Cl.SetKernelArg(kernel, 2, hDeviceMemC));
        clSafeCall(Cl.SetKernelArg(kernel, 3, length));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null,
            new[] { (IntPtr)length }, new[] { (IntPtr)256 }, 0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, hDeviceMemC, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(float) * length), C, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        // verify against the host-side sum
        for (int i = 0; i < length; i++)
        {
            float sum = A[i] + B[i];
            float err = Math.Abs((sum - C[i]) / sum);
            Assert.That(err, Is.LessThanOrEqualTo(1E-3F));
        }
    }
    finally
    {
        // release the handles created above even when a call or an assertion fails
        hDeviceMemC.Dispose();
        hDeviceMemB.Dispose();
        hDeviceMemA.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}
/// <summary>
/// Emits a static method <c>TestCase(int arg, int[] addr)</c> into a dynamic assembly, invokes
/// it on the host, converts it to an OpenCL program via <c>BuildIR().ToGPUClProgram</c>, runs
/// the resulting "TestCase" kernel with argument 8 and expects 1 in the one-element result.
/// </summary>
public void BlockMutation()
{
    AssemblyName assemblyName = new AssemblyName("UniGPUTestFixture");
    AssemblyBuilder assemblyBuilder = AppDomain.CurrentDomain.DefineDynamicAssembly(assemblyName, AssemblyBuilderAccess.RunAndSave);
    ModuleBuilder moduleBuilder = assemblyBuilder.DefineDynamicModule(assemblyName.Name, assemblyName.Name + ".dll");

    TypeBuilder typeBuilder = moduleBuilder.DefineType("CILBBTypeMutation", TypeAttributes.Public);

    MethodBuilder methodBuilder = typeBuilder.DefineMethod("TestCase",
        MethodAttributes.Public | MethodAttributes.Static, typeof(void),
        new Type[] { typeof(int), typeof(int[]) });

    methodBuilder.DefineParameter(1, ParameterAttributes.None, "arg");
    methodBuilder.DefineParameter(2, ParameterAttributes.None, "addr");

    ILGenerator il = methodBuilder.GetILGenerator();

    // declares local slot 0 (float), accessed below through Stloc_0 / Ldloc_0
    LocalBuilder lb = il.DeclareLocal(typeof(float));

    Label ZERO = il.DefineLabel();
    Label LOOP = il.DefineLabel();
    Label LOOP_FLT_MUTATOR = il.DefineLabel();
    Label LOOP_INT_MUTATOR = il.DefineLabel();

    // The instruction sequence below is order-sensitive and is reproduced unchanged.
    il.Emit(OpCodes.Ldarg_1);
    il.Emit(OpCodes.Ldc_I4_0);
    il.Emit(OpCodes.Ldarg_0);
    il.Emit(OpCodes.Ldarg_0);
    il.Emit(OpCodes.Brfalse, ZERO);

    il.MarkLabel(LOOP);
    il.Emit(OpCodes.Conv_I2);
    il.Emit(OpCodes.Starg, 0);
    il.Emit(OpCodes.Ldarga, 0);
    il.Emit(OpCodes.Dup);
    il.Emit(OpCodes.Ldind_I4);
    il.Emit(OpCodes.Dup);
    il.Emit(OpCodes.Ldc_I4_2);
    il.Emit(OpCodes.Rem);
    il.Emit(OpCodes.Not);
    il.Emit(OpCodes.Ldc_I4_1);
    il.Emit(OpCodes.And);
    il.Emit(OpCodes.Brtrue, LOOP_FLT_MUTATOR);

    il.MarkLabel(LOOP_INT_MUTATOR);
    il.Emit(OpCodes.Conv_I4);
    il.Emit(OpCodes.Starg, 0);
    il.Emit(OpCodes.Ldind_I4);
    il.Emit(OpCodes.Ldarg_0);
    il.Emit(OpCodes.Add);
    il.Emit(OpCodes.Ldc_I4_2);
    il.Emit(OpCodes.Div);
    il.Emit(OpCodes.Ldc_I4_M1);
    il.Emit(OpCodes.Neg);
    il.Emit(OpCodes.Sub);
    il.Emit(OpCodes.Dup);
    il.Emit(OpCodes.Ldc_I4_1);
    il.Emit(OpCodes.Bge, LOOP);
    il.Emit(OpCodes.Br, ZERO);

    il.MarkLabel(LOOP_FLT_MUTATOR);
    il.Emit(OpCodes.Conv_R4);
    il.Emit(OpCodes.Stloc_0);
    il.Emit(OpCodes.Pop);
    il.Emit(OpCodes.Ldloc_0);
    il.Emit(OpCodes.Ldc_R4, 1.0f);
    il.Emit(OpCodes.Sub);
    il.Emit(OpCodes.Dup);
    il.Emit(OpCodes.Ldc_R4, 1.0f);
    il.Emit(OpCodes.Bge, LOOP);
    il.Emit(OpCodes.Conv_I4);

    il.MarkLabel(ZERO);
    il.Emit(OpCodes.Ldc_I4_1);
    il.Emit(OpCodes.Add);
    il.Emit(OpCodes.Stelem_I4);
    il.Emit(OpCodes.Ret);

    MethodInfo method = typeBuilder.CreateType().GetMethod(methodBuilder.Name);

    // run the emitted method on the host
    int[] res = { 0 };
    method.Invoke(null, new object[] { 8, res });
    // NOTE(review): the host-side check below was already commented out; reason not visible here.
    //Assert.AreEqual(1, res[0]);

    // translate the emitted method to an OpenCL program and build it
    Cl.Program program = method.BuildIR().ToGPUClProgram(device, context);
    clSafeCall(Cl.BuildProgram(program, 1, new[] { device }, string.Empty, null, IntPtr.Zero));
    Cl.BuildStatus buildStatus = Cl.GetProgramBuildInfo(program, device, Cl.ProgramBuildInfo.Status, out error).CastTo<Cl.BuildStatus>();
    // make sure the status query itself succeeded before trusting its value
    clSafeCall(error);
    Assert.AreEqual(Cl.BuildStatus.Success, buildStatus);

    Cl.Kernel kernel = Cl.CreateKernel(program, "TestCase", out error);
    clSafeCall(error);

    Cl.Mem cl_res = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly, (IntPtr)sizeof(int), IntPtr.Zero, out error);
    clSafeCall(error);

    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, (Cl.CommandQueueProperties)0, out error);
    clSafeCall(error);

    clSafeCall(Cl.SetKernelArg(kernel, 0, 8));
    clSafeCall(Cl.SetKernelArg(kernel, 1, cl_res));

    clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

    // read the device result into the same host array used for the host-side run
    clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, cl_res, Cl.Bool.True, IntPtr.Zero, (IntPtr)sizeof(int), res, 0, null, out clevent));

    clSafeCall(Cl.Finish(cmdQueue));

    // release every handle created by this method (the queue and kernel were previously leaked)
    clSafeCall(Cl.ReleaseMemObject(cl_res));
    cmdQueue.Dispose();
    kernel.Dispose();
    program.Dispose();

    Assert.AreEqual(1, res[0]);
}
/// <summary>
/// Launches the "SmallTypes" kernel twice with byte, sbyte, ushort, short and bool scalar
/// arguments (argument 6 is <c>true</c>, then <c>false</c>) and verifies the two 16-bit
/// results read back after each launch.
/// </summary>
/// <param name="program">Program from which the "SmallTypes" kernel is created.</param>
public void SmallTypes(Cl.Program program)
{
    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "SmallTypes", out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    // allocate host vectors
    short[] hres1 = { 0 };
    short[] hres2 = { 0 };

    // allocate device vectors
    Cl.Mem dres1 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly,
        (IntPtr)(sizeof(short) * hres1.Length), IntPtr.Zero, out error);
    clSafeCall(error);
    Cl.Mem dres2 = Cl.CreateBuffer(context, Cl.MemFlags.WriteOnly,
        (IntPtr)(sizeof(short) * hres2.Length), IntPtr.Zero, out error);
    clSafeCall(error);

    try
    {
        // setup kernel arguments
        clSafeCall(Cl.SetKernelArg(kernel, 0, dres1));
        clSafeCall(Cl.SetKernelArg(kernel, 1, dres2));
        clSafeCall(Cl.SetKernelArg(kernel, 2, (byte)1));
        clSafeCall(Cl.SetKernelArg(kernel, 3, (sbyte)-20));
        clSafeCall(Cl.SetKernelArg(kernel, 4, (ushort)30));
        clSafeCall(Cl.SetKernelArg(kernel, 5, (short)-4));
        clSafeCall(Cl.SetKernelArg(kernel, 6, true));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dres1, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(short) * hres1.Length), hres1, 0, null, out clevent));
        // the transfer size must follow the destination array (hres2), not hres1
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dres2, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(short) * hres2.Length), hres2, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        Assert.AreEqual(7, hres1[0]);
        Assert.AreEqual(-7, hres2[0]);

        // setup kernel arguments: flip only the boolean argument
        clSafeCall(Cl.SetKernelArg(kernel, 6, false));

        // execute kernel
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 1, null, new[] { (IntPtr)1 }, null, 0, null, out clevent));

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dres1, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(short) * hres1.Length), hres1, 0, null, out clevent));
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dres2, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(short) * hres2.Length), hres2, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        Assert.AreEqual(-7, hres1[0]);
        Assert.AreEqual(7, hres2[0]);
    }
    finally
    {
        // release the handles created above even when a call or an assertion fails
        dres2.Dispose();
        dres1.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}
/// <summary>
/// Builds <paramref name="program"/>, optionally saves its binary, and runs the
/// "PoissonRBSOR" (or "PoissonRBSOR_LMem") kernel as alternating RED/BLACK passes on a
/// <paramref name="dimX"/> x <paramref name="dimY"/> grid. Prints the error against the
/// reference function <c>u</c> together with timing figures.
/// </summary>
/// <param name="device">Device the program is built for and the queue is created on.</param>
/// <param name="context">Context used for the command queue and the buffers.</param>
/// <param name="program">Program to build; the kernel is created from it.</param>
/// <param name="lmem">Selects the "_LMem" kernel variant, which also receives argument 10 and a different global size.</param>
/// <param name="x0">Lower x bound; with <paramref name="x1"/> it determines the step <c>hx</c>.</param>
/// <param name="y0">Lower y bound; with <paramref name="y1"/> it determines the step <c>hy</c>.</param>
/// <param name="x1">Upper x bound.</param>
/// <param name="y1">Upper y bound.</param>
/// <param name="dimX">Number of grid columns.</param>
/// <param name="dimY">Number of grid rows.</param>
/// <param name="N">Total number of RED+BLACK iterations; the first one runs before timing starts.</param>
/// <param name="omega">Passed to the kernel as argument 8 (presumably the SOR relaxation factor -- confirm against the kernel).</param>
/// <param name="fileName">When not null, the program binary for the first device is written to this path.</param>
/// <param name="options">Build options passed to <c>Cl.BuildProgram</c>.</param>
/// <returns>Elapsed milliseconds of the timed loop (iterations 1 to N-1).</returns>
/// <exception cref="Exception">The program build status is not <c>Success</c>.</exception>
private static long PoissonRBSOR(Cl.Device device, Cl.Context context, Cl.Program program, bool lmem,
    float x0, float y0, float x1, float y1, int dimX, int dimY, int N, float omega,
    string fileName = null, string options = "")
{
    Cl.ErrorCode error;
    Cl.Event clevent;

    // build program
    clSafeCall(Cl.BuildProgram(program, 1, new[] { device }, options, null, IntPtr.Zero));
    Cl.BuildStatus status = Cl.GetProgramBuildInfo(program, device, Cl.ProgramBuildInfo.Status, out error).CastTo<Cl.BuildStatus>();
    // make sure the status query itself succeeded before trusting its value
    clSafeCall(error);
    if (status != Cl.BuildStatus.Success)
    {
        throw new Exception(status.ToString());
    }

    // save binary
    if (fileName != null)
    {
        Cl.InfoBuffer binarySizes = Cl.GetProgramInfo(program, Cl.ProgramInfo.BinarySizes, out error);
        clSafeCall(error);
        // only the first entry is requested (Enumerable.Range(0, 1))
        Cl.InfoBufferArray binaries = new Cl.InfoBufferArray(
            binarySizes.CastToEnumerable<IntPtr>(Enumerable.Range(0, 1)).Select(sz => new Cl.InfoBuffer(sz)).ToArray());
        IntPtr szRet;
        clSafeCall(Cl.GetProgramInfo(program, Cl.ProgramInfo.Binaries, binaries.Size, binaries, out szRet));
        byte[] binary = binaries[0].CastToArray<byte>(binarySizes.CastTo<IntPtr>(0).ToInt32());
        File.WriteAllBytes(fileName, binary);
    }

    // create kernel
    Cl.Kernel kernel = Cl.CreateKernel(program, "PoissonRBSOR" + (lmem ? "_LMem" : ""), out error);
    clSafeCall(error);

    // create command queue
    Cl.CommandQueue cmdQueue = Cl.CreateCommandQueue(context, device, Cl.CommandQueueProperties.None, out error);
    clSafeCall(error);

    float hx = (x1 - x0) / dimX;
    float hy = (y1 - y0) / dimY;

    // boundary values: left/right columns first, then top/bottom rows
    float[] hgrid = new float[dimX * dimY];

    int gstride = dimX;

    for (int i = 1; i < dimY - 1; i++)
    {
        int y_idx = i * gstride;
        float y_val = y0 + i * hy;
        hgrid[y_idx] = u(x0, y_val);
        hgrid[y_idx + dimX - 1] = u(x0 + (dimX - 1) * hx, y_val);
    }

    for (int j = 1; j < dimX - 1; j++)
    {
        float x_val = x0 + j * hx;
        hgrid[j] = u(x_val, y0);
        hgrid[j + (dimY - 1) * gstride] = u(x_val, y0 + (dimY - 1) * hy);
    }

    // laplacian values: one entry per interior grid point
    float[] hlaplacian = new float[(dimX - 2) * (dimY - 2)];

    int lstride = dimX - 2;

    for (int i = 1; i < dimY - 1; i++)
    {
        for (int j = 1; j < dimX - 1; j++)
        {
            hlaplacian[j - 1 + (i - 1) * lstride] = J(x0 + j * hx, y0 + i * hy);
        }
    }

    // allocate device vectors
    Cl.Mem dgrid = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadWrite,
        (IntPtr)(sizeof(float) * hgrid.Length), hgrid, out error);
    clSafeCall(error);
    Cl.Mem dlaplacian = Cl.CreateBuffer(context, Cl.MemFlags.CopyHostPtr | Cl.MemFlags.ReadOnly,
        (IntPtr)(sizeof(float) * hlaplacian.Length), hlaplacian, out error);
    clSafeCall(error);

    try
    {
        // setup kernel arguments
        clSafeCall(Cl.SetKernelArg(kernel, 0, dgrid));
        clSafeCall(Cl.SetKernelArg(kernel, 1, dlaplacian));
        clSafeCall(Cl.SetKernelArg(kernel, 2, dimX));
        clSafeCall(Cl.SetKernelArg(kernel, 3, dimY));
        clSafeCall(Cl.SetKernelArg(kernel, 4, gstride));
        clSafeCall(Cl.SetKernelArg(kernel, 5, lstride));
        clSafeCall(Cl.SetKernelArg(kernel, 6, hx));
        clSafeCall(Cl.SetKernelArg(kernel, 7, hy));
        clSafeCall(Cl.SetKernelArg(kernel, 8, omega));
        if (lmem)
        {
            // only the "_LMem" variant takes argument 10 (given a size and a null value)
            clSafeCall(Cl.SetKernelArg(kernel, 10, (AREA_SIZE_Y + 2) * (AREA_SIZE_X + 2) * sizeof(float), null));
        }

        IntPtr[] lo = { (IntPtr)TILE_SIZE_X, (IntPtr)TILE_SIZE_Y };
        IntPtr[] gl = {
            (IntPtr)((dimX - 2 + (lmem ? AREA_SIZE_X : TILE_SIZE_X) - 1) / (lmem ? AREA_SIZE_X : TILE_SIZE_X) * TILE_SIZE_X),
            (IntPtr)((dimY - 2 + (lmem ? AREA_SIZE_Y : TILE_SIZE_Y) - 1) / (lmem ? AREA_SIZE_Y : TILE_SIZE_Y) * TILE_SIZE_Y)
        };

        // first RED/BLACK iteration completes before the stopwatch is started

        // execute RED kernel
        clSafeCall(Cl.SetKernelArg(kernel, 9, 1));
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, gl, lo, 0, null, out clevent));

        // execute BLACK kernel
        clSafeCall(Cl.SetKernelArg(kernel, 9, 0));
        clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, gl, lo, 0, null, out clevent));

        clSafeCall(Cl.Finish(cmdQueue));

        Stopwatch stopwatch = new Stopwatch();
        stopwatch.Start();

        for (int idx = 1; idx < N; idx++)
        {
            // execute RED kernel
            clSafeCall(Cl.SetKernelArg(kernel, 9, 1));
            clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, gl, lo, 0, null, out clevent));

            // execute BLACK kernel
            clSafeCall(Cl.SetKernelArg(kernel, 9, 0));
            clSafeCall(Cl.EnqueueNDRangeKernel(cmdQueue, kernel, 2, null, gl, lo, 0, null, out clevent));
        }

        clSafeCall(Cl.Finish(cmdQueue));

        stopwatch.Stop();

        // copy results from device back to host
        clSafeCall(Cl.EnqueueReadBuffer(cmdQueue, dgrid, Cl.Bool.True, IntPtr.Zero,
            (IntPtr)(sizeof(float) * hgrid.Length), hgrid, 0, null, out clevent));
        clSafeCall(Cl.Finish(cmdQueue));

        // relative error of every interior point against the reference function
        float avgerr = 0, maxerr = 0;

        for (int i = 1; i < dimY - 1; i++)
        {
            for (int j = 1; j < dimX - 1; j++)
            {
                float theory = u(x0 + j * hx, y0 + i * hy);
                float err = Math.Abs(theory - hgrid[j + i * gstride]) / Math.Abs(theory);
                avgerr += err;
                maxerr = Math.Max(maxerr, err);
            }
        }

        // NOTE(review): the sum covers (dimX - 2) * (dimY - 2) interior points but is divided
        // by dimX * dimY; kept as-is so the reported figures stay comparable.
        avgerr /= dimX * dimY;

        long elapsedTime = stopwatch.ElapsedMilliseconds;

        Console.WriteLine("average error = {0}%\nmaximal error = {1}%\nelapsed time: {2}ms\niterations per second: {3}",
            avgerr * 100, maxerr * 100, elapsedTime, (double)N / (double)elapsedTime * 1000.0d);

        return elapsedTime;
    }
    finally
    {
        // release every handle created by this method, including dlaplacian, on all paths
        dlaplacian.Dispose();
        dgrid.Dispose();
        cmdQueue.Dispose();
        kernel.Dispose();
    }
}