/// <summary>
/// Creates the vector kernels from <paramref name="program"/> and derives the
/// work sizes used to launch them from the local length of <paramref name="p"/>.
/// </summary>
/// <param name="p">Partitioning whose <c>LocalLength</c> becomes the vector size.</param>
/// <param name="program">Program object the kernels are created from.</param>
private void init(IPartitioning p, cl_program program)
{
    // One kernel object per entry point used by this class.
    clscale = cl.CreateKernel(program, "scale");
    clacc = cl.CreateKernel(program, "acc");
    clmew = cl.CreateKernel(program, "mew");
    cldnrm2 = cl.CreateKernel(program, "dnrm2");
    clinnerprod = cl.CreateKernel(program, "innerprod");

    size = p.LocalLength;

    // Full work size: 'size' rounded up to the next multiple of 'localsize'.
    globalsize = size;
    int fullRemainder = globalsize % localsize;
    if (fullRemainder > 0)
    {
        globalsize += localsize - fullRemainder;
    }

    // Half work size: half of the padded size, again rounded up to a multiple of 'localsize'.
    globalsizehalf = globalsize / 2;
    int halfRemainder = globalsizehalf % localsize;
    if (halfRemainder > 0)
    {
        globalsizehalf += localsize - halfRemainder;
    }

    // Number of 'localsize'-sized chunks in the half work size.
    groups = globalsizehalf / localsize;
}
/// <summary>
/// Creates the matrix: packs <paramref name="M"/>, creates the multiply and
/// "accumulateExternal" kernels and, when there are external elements,
/// allocates and fills the buffers used to accumulate them.
/// </summary>
/// <param name="M">Original matrix</param>
/// <param name="device">Corresponding OpenCL device</param>
/// <param name="kernelName">Name of the kernel function</param>
public clMatrix(MsrMatrix M, clDevice device, string kernelName)
    : base(M)
{
    this.device = device;
    base.PackMatrix(M);

    this.clmultiply = cl.CreateKernel(device.matrixProgram, kernelName);
    this.claccext = cl.CreateKernel(device.matrixProgram, "accumulateExternal");

    disposed = false;
    LMAA();

    // Nothing more to set up when there is no external part.
    if (extSize <= 0)
    {
        return;
    }

    // Work size for the external kernel: extSize rounded up to a multiple of extlocalsize.
    extglobalsize = extSize;
    int remainder = extSize % extlocalsize;
    if (remainder > 0)
    {
        extglobalsize += extlocalsize - remainder;
    }

    uint elementBytes = (uint)extSize * sizeof(double);
    uint indexBytes = (uint)extSize * sizeof(int);

    // Host staging memory plus the two device buffers (values and indices).
    h_ElementsToAcc = Marshal.AllocHGlobal(extSize * sizeof(double));
    d_ElementsToAcc = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, elementBytes);
    d_IndicesToAccumulate = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, indexBytes);

    // Upload the index list once, at construction time.
    cl.EnqueueWriteBuffer(device.cq, d_IndicesToAccumulate, true, 0, indexBytes, h_IndicesToAccumulate);
}
/// <summary>
/// Creates the communication vector for <paramref name="v"/>: creates the
/// "fillSendBuffer" kernel, records the per-process send lengths from the
/// communication lists of <paramref name="M"/>, and allocates one contiguous
/// send buffer (host and device) together with the concatenated index list.
/// </summary>
/// <param name="M">Matrix whose <c>_SpmvCommPattern.ComLists</c> supplies the indices to send.</param>
/// <param name="v">Owning vector; its device provides the program, context and command queue.</param>
internal clCommVector(MatrixBase M, clVector v)
    : base(M, v)
{
    this.owner = v;
    clfill = cl.CreateKernel(owner.device.vectorProgram, "fillSendBuffer");

    IDictionary<int, int[]> comLists = M._SpmvCommPattern.ComLists;

    // Record how many entries go to each process and sum them up.
    int totalLength = 0;
    foreach (int rank in comLists.Keys)
    {
        int count = comLists[rank].Length;
        base.SendBuffersLengths[rank] = count;
        totalLength += count;
    }

    // Work size: total length rounded up to a multiple of 'localsize'.
    size = totalLength;
    globalsize = size;
    int remainder = size % localsize;
    if (remainder > 0)
    {
        globalsize += localsize - remainder;
    }

    // Nothing to allocate when there is nothing to send.
    if (size <= 0)
    {
        return;
    }

    uint indexBytes = (uint)size * sizeof(int);
    uint valueBytes = (uint)size * sizeof(double);

    // alloc
    h_IndicesToSend = new int[size];
    d_IndicesToSend = cl.CreateBuffer(owner.device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, indexBytes);

    h_SendBuffer = Marshal.AllocHGlobal(size * sizeof(double));
    d_SendBuffer = cl.CreateBuffer(owner.device.env.context, cl_mem_flags.CL_MEM_WRITE_ONLY, valueBytes);

    // Concatenate the per-process lists; each process gets a slice of the single send buffer.
    int writePos = 0;
    unsafe
    {
        double* cursor = (double*)h_SendBuffer;
        foreach (int rank in comLists.Keys)
        {
            // start address for sending to process 'rank'
            base.SendBuffers[rank] = (IntPtr)cursor;

            int count = base.SendBuffersLengths[rank];
            Array.Copy(comLists[rank], 0, h_IndicesToSend, writePos, count);

            cursor += count;
            writePos += count;
        }
    }

    cl.EnqueueWriteBuffer(owner.device.cq, d_IndicesToSend, true, 0, indexBytes, h_IndicesToSend);
}
// Externally implemented entry point 'clEnqueueNDRangeKernel' (managed-array overload).
// NOTE(review): presumably binds the OpenCL C API function of the same name; confirm
// against the import attribute, which is not visible here.
//
// command_queue / kernel    : queue and kernel handles.
// work_dim                  : dimension count passed as cl_uint.
// global_work_offset,
// global_work_size,
// local_work_size           : IntPtr arrays, marshalled input-only as LPArray.
// num_events_in_wait_list,
// event_wait_list           : wait-list count and cl_event array (input-only, LPArray).
// _event                    : raw pointer to a cl_event.
// Returns an ErrorCode.
internal static extern ErrorCode clEnqueueNDRangeKernel( cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, [In][MarshalAs(UnmanagedType.LPArray)] IntPtr[] global_work_offset, [In][MarshalAs(UnmanagedType.LPArray)] IntPtr[] global_work_size, [In][MarshalAs(UnmanagedType.LPArray)] IntPtr[] local_work_size, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event);
// Externally implemented entry point 'clEnqueueNDRangeKernel' (raw-pointer overload).
// Takes the same parameter list as the managed-array overload, but the work offset,
// work sizes and event wait list are passed as unmanaged IntPtr pointers, so no
// array marshalling attributes are involved.
// NOTE(review): presumably binds the OpenCL C API function of the same name; confirm.
// Returns an ErrorCode.
internal static extern ErrorCode clEnqueueNDRangeKernel( cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, IntPtr *global_work_offset, IntPtr *global_work_size, IntPtr *local_work_size, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event);
/// <summary>
/// Sample entry point: builds a "vectorAdd" kernel, adds two random float
/// vectors of <c>cnBlocks * cnBlockSize</c> elements on the first device the
/// context reports, and writes each "a + b = c" line to the trace output.
/// </summary>
/// <exception cref="InvalidOperationException">
/// An OpenCL call reported an error code other than <c>ErrorCode.Success</c>,
/// or the context reported no device.
/// </exception>
public static void Main()
{
    const int cnBlockSize = 4;
    const int cnBlocks = 3;
    IntPtr cnDimension = new IntPtr(cnBlocks * cnBlockSize);

    // The kernel source contains a '//' line comment, so the line breaks inside
    // this literal are significant: without them the comment would swallow the
    // rest of the kernel body.
    string sProgramSource = @"
__kernel void vectorAdd(__global const float * a, __global const float * b, __global float * c)
{
    // Vector element index
    int nIndex = get_global_id(0);
    c[nIndex] = a[nIndex] + b[nIndex];
}
";

    ErrorCode error;

    // create OpenCL device & context
    cl_context hContext;
    unsafe
    {
        hContext = CL.CreateContextFromType((ContextProperties *)null, DeviceTypeFlags.DeviceTypeDefault, IntPtr.Zero, IntPtr.Zero, &error);
    }
    CheckError(error, "CreateContextFromType");

    // query all devices available to the context
    IntPtr nContextDescriptorSize;
    CL.GetContextInfo(hContext, ContextInfo.ContextDevices, IntPtr.Zero, IntPtr.Zero, out nContextDescriptorSize);

    // NOTE(review): the reported value is used directly as the element count; if it is a
    // size in bytes the array is merely larger than necessary - confirm.
    cl_device_id[] aDevices = new cl_device_id[nContextDescriptorSize.ToInt32()];
    if (aDevices.Length == 0)
    {
        throw new InvalidOperationException("The OpenCL context reported no devices.");
    }

    unsafe
    {
        fixed (cl_device_id *ptr = aDevices)
        {
            IntPtr ret;
            CL.GetContextInfo(hContext, ContextInfo.ContextDevices, nContextDescriptorSize, new IntPtr(ptr), out ret);
        }
    }

    // create a command queue for first device the context reported
    cl_command_queue hCmdQueue = CL.CreateCommandQueue(hContext, aDevices[0], (CommandQueueFlags)0, out error);
    CheckError(error, "CreateCommandQueue");

    // create & compile program
    cl_program hProgram;
    unsafe
    {
        hProgram = CL.CreateProgramWithSource(hContext, 1, new string[] { sProgramSource }, null, &error);
    }
    CheckError(error, "CreateProgramWithSource");
    CL.BuildProgram(hProgram, 0, (IntPtr[])null, null, IntPtr.Zero, IntPtr.Zero);

    // create kernel
    cl_kernel hKernel = CL.CreateKernel(hProgram, "vectorAdd", out error);
    CheckError(error, "CreateKernel");

    // allocate host vectors
    float[] A = new float[cnDimension.ToInt32()];
    float[] B = new float[cnDimension.ToInt32()];
    float[] C = new float[cnDimension.ToInt32()];

    // initialize host memory
    Random rand = new Random();
    for (int i = 0; i < A.Length; i++)
    {
        A[i] = rand.Next() % 256;
        B[i] = rand.Next() % 256;
    }

    // allocate device memory
    unsafe
    {
        fixed (float *pA = A)
        fixed (float *pB = B)
        fixed (float *pC = C)
        {
            IntPtr cbVector = new IntPtr(cnDimension.ToInt32() * sizeof(float));

            cl_mem hDeviceMemA, hDeviceMemB, hDeviceMemC;
            hDeviceMemA = CL.CreateBuffer(hContext, MemFlags.MemReadOnly | MemFlags.MemCopyHostPtr, cbVector, new IntPtr(pA), out error);
            CheckError(error, "CreateBuffer(A)");

            // Fixed: the second buffer was initialised from pA; it must copy from pB.
            hDeviceMemB = CL.CreateBuffer(hContext, MemFlags.MemReadOnly | MemFlags.MemCopyHostPtr, cbVector, new IntPtr(pB), out error);
            CheckError(error, "CreateBuffer(B)");

            hDeviceMemC = CL.CreateBuffer(hContext, MemFlags.MemWriteOnly, cbVector, IntPtr.Zero, out error);
            CheckError(error, "CreateBuffer(C)");

            // setup parameter values
            CL.SetKernelArg(hKernel, 0, new IntPtr(sizeof(cl_mem)), new IntPtr(&hDeviceMemA));
            CL.SetKernelArg(hKernel, 1, new IntPtr(sizeof(cl_mem)), new IntPtr(&hDeviceMemB));
            CL.SetKernelArg(hKernel, 2, new IntPtr(sizeof(cl_mem)), new IntPtr(&hDeviceMemC));

            // write data from host to device
            CL.EnqueueWriteBuffer(hCmdQueue, hDeviceMemA, true, IntPtr.Zero, cbVector, new IntPtr(pA), 0, null, (IntPtr[])null);
            CL.EnqueueWriteBuffer(hCmdQueue, hDeviceMemB, true, IntPtr.Zero, cbVector, new IntPtr(pB), 0, null, (IntPtr[])null);

            // execute kernel
            error = (ErrorCode)CL.EnqueueNDRangeKernel(hCmdQueue, hKernel, 1, null, &cnDimension, null, 0, null, null);
            CheckError(error, "EnqueueNDRangeKernel");

            // copy results from device back to host
            error = (ErrorCode)CL.EnqueueReadBuffer(hCmdQueue, hDeviceMemC, true, IntPtr.Zero, cbVector, new IntPtr(pC), 0, null, (IntPtr[])null);
            CheckError(error, "EnqueueReadBuffer");

            CL.Finish(hCmdQueue);

            CL.ReleaseMemObject(hDeviceMemA);
            CL.ReleaseMemObject(hDeviceMemB);
            CL.ReleaseMemObject(hDeviceMemC);
        }
    }

    for (int i = 0; i < A.Length; i++)
    {
        System.Diagnostics.Trace.WriteLine(String.Format("{0} + {1} = {2}", A[i], B[i], C[i]));
    }
}

/// <summary>
/// Throws when <paramref name="error"/> is not <c>ErrorCode.Success</c>.
/// </summary>
/// <param name="error">Error code reported by an OpenCL call.</param>
/// <param name="operation">Name of the call, included in the exception message.</param>
private static void CheckError(ErrorCode error, string operation)
{
    if (error != ErrorCode.Success)
    {
        throw new InvalidOperationException(operation + " failed: " + error.ToString());
    }
}
// Externally implemented entry point 'clEnqueueTask' (raw-pointer overload).
// NOTE(review): presumably binds the OpenCL C API function of the same name; confirm.
//
// command_queue / kernel    : queue and kernel handles.
// num_events_in_wait_list,
// event_wait_list           : wait-list count and an unmanaged IntPtr pointer to the list.
// _event                    : raw pointer to a cl_event.
// Returns an ErrorCode.
internal static extern ErrorCode clEnqueueTask( cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event);
// Externally implemented entry point 'clEnqueueTask' (managed-array overload).
// Same parameter list as the raw-pointer overload, except that the event wait list is
// a cl_event[] marshalled input-only as LPArray.
// NOTE(review): presumably binds the OpenCL C API function of the same name; confirm.
// Returns an ErrorCode.
internal static extern ErrorCode clEnqueueTask( cl_command_queue command_queue, cl_kernel kernel, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event);
// Externally implemented entry point 'clGetKernelWorkGroupInfo'.
// NOTE(review): presumably binds the OpenCL C API function of the same name; confirm.
//
// kernel / device           : kernel and device handles.
// param_name                : selector of type cl_kernel_work_group_info.
// param_value_size          : size argument that accompanies param_value.
// param_value               : untyped pointer supplied by the caller.
// param_value_size_ret      : out parameter of type IntPtr.
// Returns an ErrorCode.
internal static extern ErrorCode clGetKernelWorkGroupInfo(cl_kernel kernel, cl_device_id device, cl_kernel_work_group_info param_name, IntPtr param_value_size, void *param_value, out IntPtr param_value_size_ret);
// Externally implemented entry point 'clGetKernelInfo'.
// NOTE(review): presumably binds the OpenCL C API function of the same name; confirm.
//
// kernel                    : kernel handle.
// param_name                : selector of type cl_kernel_info.
// param_value_size          : size argument that accompanies param_value.
// param_value               : untyped pointer supplied by the caller.
// param_value_size_ret      : out parameter of type IntPtr.
// Returns an ErrorCode.
internal static extern ErrorCode clGetKernelInfo(cl_kernel kernel, cl_kernel_info param_name, IntPtr param_value_size, void *param_value, out IntPtr param_value_size_ret);
// Externally implemented entry point 'clSetKernelArg'.
// NOTE(review): presumably binds the OpenCL C API function of the same name; confirm.
//
// kernel                    : kernel handle.
// arg_index                 : argument index passed as cl_uint.
// arg_size                  : size argument that accompanies arg_value.
// arg_value                 : untyped pointer supplied by the caller.
// Returns an ErrorCode.
internal static extern ErrorCode clSetKernelArg(cl_kernel kernel, cl_uint arg_index, IntPtr arg_size, void *arg_value);
// Externally implemented entry point 'clReleaseKernel'; takes a kernel handle and
// returns an ErrorCode.
// NOTE(review): presumably binds the OpenCL C API function of the same name and is the
// counterpart of clRetainKernel; confirm.
internal static extern ErrorCode clReleaseKernel(cl_kernel kernel);
// Externally implemented entry point 'clRetainKernel'; takes a kernel handle and
// returns an ErrorCode.
// NOTE(review): presumably binds the OpenCL C API function of the same name and is the
// counterpart of clReleaseKernel; confirm.
internal static extern ErrorCode clRetainKernel(cl_kernel kernel);