/// <summary>
/// Flattens the per-block sub-vector index lists of <c>m_internalData</c> into two host arrays
/// and copies them into read-only device buffers (<c>d_xSubStart</c>, <c>d_blockSubVector</c>).
/// </summary>
/// <remarks>
/// <c>h_xSubStart[i]</c> is the offset of block <c>i</c> inside the flattened index list; the
/// additional last entry holds the total length. While scanning, <c>maxBlockSubVectorLength</c>
/// is raised to the largest block length encountered.
/// </remarks>
private void CreateDataStructures() {
    int blockCount = m_internalData.BlockSubVectorLength.Length;

    // Prefix sums of the block lengths, with one trailing entry for the total.
    int[] h_xSubStart = new int[blockCount + 1];
    int len = 0;
    for (int i = 0; i < blockCount; i++) {
        h_xSubStart[i] = len;
        len += m_internalData.BlockSubVectorLength[i];
        if (m_internalData.BlockSubVectorLength[i] > maxBlockSubVectorLength) {
            maxBlockSubVectorLength = m_internalData.BlockSubVectorLength[i];
        }
    }
    h_xSubStart[blockCount] = len;

    // Concatenate the used part of every row of BlockSubVector.
    int[] h_blockSubVector = new int[len];
    int idx = 0;
    for (int i = 0; i < blockCount; i++) {
        for (int j = 0; j < m_internalData.BlockSubVectorLength[i]; j++) {
            h_blockSubVector[idx] = m_internalData.BlockSubVector[i, j];
            idx++;
        }
    }

    // The buffer size must equal the host array size: with CL_MEM_COPY_HOST_PTR the runtime
    // copies exactly that many bytes from the host pointer. The array already contains the
    // trailing "total" entry, so no further +1 is needed (the previous size read one int
    // beyond the end of h_xSubStart).
    d_xSubStart = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY, (uint)h_xSubStart.Length * sizeof(int), h_xSubStart);
    d_blockSubVector = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY, (uint)h_blockSubVector.Length * sizeof(int), h_blockSubVector);
}
/// <summary>
/// Creates the OpenCL matrix: packs the matrix, creates the multiply and
/// "accumulateExternal" kernels and, if there are external elements to accumulate,
/// allocates the host and device buffers used for them.
/// </summary>
/// <param name="M">Original matrix</param>
/// <param name="device">Corresponding OpenCL device</param>
/// <param name="kernelName">Name of the kernel function</param>
public clMatrix(MsrMatrix M, clDevice device, string kernelName)
    : base(M) {
    this.device = device;
    base.PackMatrix(M);

    this.clmultiply = cl.CreateKernel(device.matrixProgram, kernelName);
    this.claccext = cl.CreateKernel(device.matrixProgram, "accumulateExternal");
    disposed = false;

    LMAA();

    // Nothing external to accumulate: no additional buffers are needed.
    if (extSize <= 0) {
        return;
    }

    // Global work size = extSize rounded up to the next multiple of extlocalsize.
    extglobalsize = extSize;
    int remainder = extSize % extlocalsize;
    if (remainder > 0) {
        extglobalsize += extlocalsize - remainder;
    }

    // Host staging memory plus the device-side value and index buffers.
    h_ElementsToAcc = Marshal.AllocHGlobal(extSize * sizeof(double));
    d_ElementsToAcc = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, (uint)extSize * sizeof(double));
    d_IndicesToAccumulate = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, (uint)extSize * sizeof(int));

    // Blocking upload of the accumulation indices.
    cl.EnqueueWriteBuffer(device.cq, d_IndicesToAccumulate, true, 0, (uint)extSize * sizeof(int), h_IndicesToAccumulate);
}
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueUnmapMemObject</c>;
/// raw-pointer variant in which the event wait list is passed as <c>IntPtr*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="memobj">Memory object handle.</param>
/// <param name="mapped_ptr">Untyped pointer passed through to the native call.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Pointer to the wait-list entries.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueUnmapMemObject( cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event);
/// <summary>
/// Allocate memory and copy matrix data on GPU
/// </summary>
/// <remarks>
/// Calls <c>base.Lock()</c> and then creates two read-only device buffers initialised from
/// the host arrays <c>m_internalData.Val</c> and <c>m_internalData.CellColumn</c>.
/// </remarks>
public override void Lock() {
    base.Lock();

    d_cellData = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY, (uint)m_internalData.Val.Length * sizeof(double), m_internalData.Val);

    // CellColumn is an index array, so its byte size is Length * sizeof(int). Sizing it with
    // sizeof(double) made CL_MEM_COPY_HOST_PTR read twice as many bytes as the host array holds.
    // NOTE(review): assumes CellColumn is int[] - confirm against the data structure declaration.
    d_cellColIdx = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY, (uint)m_internalData.CellColumn.Length * sizeof(int), m_internalData.CellColumn);
}
/// <summary>
/// Locks the matrix and uploads its entries to the GPU: one read-only device buffer for the
/// matrix values and one for the column indices, both initialised from the host arrays.
/// </summary>
public override void Lock() {
    base.Lock();

    // Both buffers are read-only on the device and filled from host memory at creation.
    cl_mem_flags readOnlyCopy = cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY;

    d_val = cl.CreateBuffer(
        device.env.context,
        readOnlyCopy,
        (uint)m_internalData.MtxEntries.Values.Length * sizeof(double),
        m_internalData.MtxEntries.Values);

    d_colIdx = cl.CreateBuffer(
        device.env.context,
        readOnlyCopy,
        (uint)m_internalData.ColInd.Values.Length * sizeof(int),
        m_internalData.ColInd.Values);
}
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueUnmapMemObject</c>;
/// variant in which the event wait list is a managed <c>cl_event[]</c> marshalled as an
/// input-only <c>LPArray</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="memobj">Memory object handle.</param>
/// <param name="mapped_ptr">Untyped pointer passed through to the native call.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueUnmapMemObject( cl_command_queue command_queue, cl_mem memobj, void *mapped_ptr, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event);
/// <summary>
/// Locks the vector and moves it to the device: allocates unmanaged host memory for the
/// per-group results, creates a device buffer initialised from <c>h_data</c>, and creates a
/// write-only result buffer that uses the unmanaged host memory as its backing store.
/// </summary>
public override void Lock() {
    base.Lock();

    // Unmanaged host storage: one double per work group.
    h_result = Marshal.AllocHGlobal(groups * sizeof(double));

    // Vector data, copied from the managed array when the buffer is created.
    d_data = cl.CreateBuffer(
        device.env.context,
        cl_mem_flags.CL_MEM_COPY_HOST_PTR,
        (uint)h_data.Length * sizeof(double),
        h_data);

    // Result buffer backed directly by h_result.
    d_result = cl.CreateBuffer(
        device.env.context,
        cl_mem_flags.CL_MEM_USE_HOST_PTR | cl_mem_flags.CL_MEM_WRITE_ONLY,
        (uint)groups * sizeof(double),
        h_result);
}
/// <summary>
/// Creates the communication helper for <paramref name="v"/>: creates the "fillSendBuffer"
/// kernel, records the per-rank send lengths from the communication lists of
/// <paramref name="M"/>, and (if anything has to be sent) allocates the send buffers and
/// uploads the concatenated index list to the device.
/// </summary>
/// <param name="M">Matrix whose <c>_SpmvCommPattern.ComLists</c> defines what is sent to which rank.</param>
/// <param name="v">Owning vector; its device provides the program, context and command queue.</param>
internal clCommVector(MatrixBase M, clVector v)
    : base(M, v) {
    this.owner = v;
    clfill = cl.CreateKernel(owner.device.vectorProgram, "fillSendBuffer");

    IDictionary<int, int[]> comLists = M._SpmvCommPattern.ComLists;

    // First pass: remember how many items go to each rank and sum them up.
    int total = 0;
    foreach (int rank in comLists.Keys) {
        int count = comLists[rank].Length;
        base.SendBuffersLengths[rank] = count;
        total += count;
    }
    size = total;

    // Global work size = size rounded up to the next multiple of localsize.
    globalsize = size;
    int remainder = size % localsize;
    if (remainder > 0) {
        globalsize += localsize - remainder;
    }

    // Nothing to send: no buffers required.
    if (size <= 0) {
        return;
    }

    // Allocate index list and send buffer, each on host and device.
    h_IndicesToSend = new int[size];
    d_IndicesToSend = cl.CreateBuffer(owner.device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, (uint)size * sizeof(int));
    h_SendBuffer = Marshal.AllocHGlobal(size * sizeof(double));
    d_SendBuffer = cl.CreateBuffer(owner.device.env.context, cl_mem_flags.CL_MEM_WRITE_ONLY, (uint)size * sizeof(double));

    // Second pass: all ranks share one contiguous send buffer and one contiguous index list;
    // each rank gets the start address of its own segment.
    int offset = 0;
    unsafe {
        double* segmentStart = (double*)h_SendBuffer;
        foreach (int rank in comLists.Keys) {
            base.SendBuffers[rank] = (IntPtr)segmentStart; // start address for sending to process 'rank'
            int count = base.SendBuffersLengths[rank];
            segmentStart += count;

            Array.Copy(comLists[rank], 0, h_IndicesToSend, offset, count); // append this rank's list
            offset += count;
        }
    }

    // Blocking upload of the concatenated index list.
    cl.EnqueueWriteBuffer(owner.device.cq, d_IndicesToSend, true, 0, (uint)size * sizeof(int), h_IndicesToSend);
}
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueCopyBufferToImage</c>;
/// raw-pointer variant: destination origin, region and event wait list are passed as <c>IntPtr*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="src_buffer">Source buffer handle.</param>
/// <param name="dst_image">Destination image handle.</param>
/// <param name="src_offset">Offset into the source buffer.</param>
/// <param name="dst_origin">Pointer to the destination origin values.</param>
/// <param name="region">Pointer to the region values.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Pointer to the wait-list entries.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueCopyBufferToImage( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, IntPtr src_offset, IntPtr *dst_origin, IntPtr *region, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueCopyBufferToImage</c>;
/// managed-array variant: destination origin and region are <c>IntPtr[]</c> marshalled as
/// input-only <c>LPArray</c> with <c>SizeConst = 3</c>, the wait list is a <c>cl_event[]</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="src_buffer">Source buffer handle.</param>
/// <param name="dst_image">Destination image handle.</param>
/// <param name="src_offset">Offset into the source buffer.</param>
/// <param name="dst_origin">Destination origin values (declared with three elements).</param>
/// <param name="region">Region values (declared with three elements).</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueCopyBufferToImage( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_image, IntPtr src_offset, [In][MarshalAs(UnmanagedType.LPArray, SizeConst = 3)] IntPtr[] dst_origin, [In][MarshalAs(UnmanagedType.LPArray, SizeConst = 3)] IntPtr[] region, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueCopyBuffer</c>;
/// raw-pointer variant in which the event wait list is passed as <c>IntPtr*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="src_buffer">Source buffer handle.</param>
/// <param name="dst_buffer">Destination buffer handle.</param>
/// <param name="src_offset">Offset into the source buffer.</param>
/// <param name="dst_offset">Offset into the destination buffer.</param>
/// <param name="cb">Size argument of the copy (presumably a byte count - confirm against the OpenCL specification).</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Pointer to the wait-list entries.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueCopyBuffer( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, IntPtr src_offset, IntPtr dst_offset, IntPtr cb, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueCopyBuffer</c>;
/// variant in which the event wait list is a managed <c>cl_event[]</c> marshalled as an
/// input-only <c>LPArray</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="src_buffer">Source buffer handle.</param>
/// <param name="dst_buffer">Destination buffer handle.</param>
/// <param name="src_offset">Offset into the source buffer.</param>
/// <param name="dst_offset">Offset into the destination buffer.</param>
/// <param name="cb">Size argument of the copy (presumably a byte count - confirm against the OpenCL specification).</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueCopyBuffer( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, IntPtr src_offset, IntPtr dst_offset, IntPtr cb, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueWriteBuffer</c>;
/// raw-pointer variant in which the event wait list is passed as <c>IntPtr*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="buffer">Buffer handle.</param>
/// <param name="blocking_write">Blocking flag passed to the native call.</param>
/// <param name="offset">Offset into the buffer.</param>
/// <param name="cb">Size argument of the write (presumably a byte count - confirm against the OpenCL specification).</param>
/// <param name="ptr">Untyped pointer to the host data.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Pointer to the wait-list entries.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueWriteBuffer( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, IntPtr offset, IntPtr cb, void *ptr, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueWriteBuffer</c>;
/// variant in which the event wait list is a managed <c>cl_event[]</c> marshalled as an
/// input-only <c>LPArray</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="buffer">Buffer handle.</param>
/// <param name="blocking_write">Blocking flag passed to the native call.</param>
/// <param name="offset">Offset into the buffer.</param>
/// <param name="cb">Size argument of the write (presumably a byte count - confirm against the OpenCL specification).</param>
/// <param name="ptr">Untyped pointer to the host data.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueWriteBuffer( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, IntPtr offset, IntPtr cb, void *ptr, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event);
/// <summary>
/// Allocate memory and copy matrix data on GPU
/// </summary>
/// <remarks>
/// Calls <c>base.Lock()</c>, casts <c>base.m_LocalMtx</c> to <c>MatrixBase.CSR</c> and creates
/// three read-only device buffers initialised from its <c>Val</c>, <c>RowStart</c> and
/// <c>ColInd</c> arrays.
/// </remarks>
public override void Lock() {
    base.Lock();
    MatrixBase.CSR LocalMatrix = (MatrixBase.CSR)base.m_LocalMtx;

    d_val = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY, (uint)LocalMatrix.Val.Length * sizeof(double), LocalMatrix.Val);

    // RowStart and ColInd are index arrays, so their byte size is Length * sizeof(int).
    // Sizing them with sizeof(double) made CL_MEM_COPY_HOST_PTR read twice as many bytes
    // as the host arrays hold.
    // NOTE(review): assumes both arrays are int[] - confirm against MatrixBase.CSR.
    d_rowStart = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY, (uint)LocalMatrix.RowStart.Length * sizeof(int), LocalMatrix.RowStart);
    d_colIdx = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_COPY_HOST_PTR | cl_mem_flags.CL_MEM_READ_ONLY, (uint)LocalMatrix.ColInd.Length * sizeof(int), LocalMatrix.ColInd);
}
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueMapBuffer</c>;
/// raw-pointer variant in which the event wait list is passed as <c>IntPtr*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="buffer">Buffer handle.</param>
/// <param name="blocking_map">Blocking flag passed to the native call.</param>
/// <param name="map_flags">Map flags passed to the native call.</param>
/// <param name="offset">Offset into the buffer.</param>
/// <param name="cb">Size argument of the mapping (presumably a byte count - confirm against the OpenCL specification).</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Pointer to the wait-list entries.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <param name="errcode_ret">Receives the error code of the native call.</param>
/// <returns>The untyped pointer returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern void *clEnqueueMapBuffer( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, IntPtr offset, IntPtr cb, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event, out ErrorCode errcode_ret);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueMapBuffer</c>;
/// variant in which the event wait list is a managed <c>cl_event[]</c> marshalled as an
/// input-only <c>LPArray</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="buffer">Buffer handle.</param>
/// <param name="blocking_map">Blocking flag passed to the native call.</param>
/// <param name="map_flags">Map flags passed to the native call.</param>
/// <param name="offset">Offset into the buffer.</param>
/// <param name="cb">Size argument of the mapping (presumably a byte count - confirm against the OpenCL specification).</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <param name="errcode_ret">Receives the error code of the native call.</param>
/// <returns>The untyped pointer returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern void *clEnqueueMapBuffer( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_map, cl_map_flags map_flags, IntPtr offset, IntPtr cb, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event, out ErrorCode errcode_ret);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueWriteImage</c>;
/// raw-pointer variant: origin, region and event wait list are passed as <c>IntPtr*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="image">Image handle.</param>
/// <param name="blocking_write">Blocking flag passed to the native call.</param>
/// <param name="origin">Pointer to the origin values.</param>
/// <param name="region">Pointer to the region values.</param>
/// <param name="input_row_pitch">Row pitch of the host data.</param>
/// <param name="input_slice_pitch">Slice pitch of the host data.</param>
/// <param name="ptr">Untyped pointer to the host data.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Pointer to the wait-list entries.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueWriteImage( cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, IntPtr *origin, IntPtr *region, IntPtr input_row_pitch, IntPtr input_slice_pitch, void *ptr, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueWriteImage</c>;
/// managed-array variant: origin and region are <c>IntPtr[]</c> marshalled as input-only
/// <c>LPArray</c> with <c>SizeConst = 3</c>, the wait list is a <c>cl_event[]</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="image">Image handle.</param>
/// <param name="blocking_write">Blocking flag passed to the native call.</param>
/// <param name="origin">Origin values (declared with three elements).</param>
/// <param name="region">Region values (declared with three elements).</param>
/// <param name="input_row_pitch">Row pitch of the host data.</param>
/// <param name="input_slice_pitch">Slice pitch of the host data.</param>
/// <param name="ptr">Untyped pointer to the host data.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueWriteImage( cl_command_queue command_queue, cl_mem image, cl_bool blocking_write, [In][MarshalAs(UnmanagedType.LPArray, SizeConst = 3)] IntPtr[] origin, [In][MarshalAs(UnmanagedType.LPArray, SizeConst = 3)] IntPtr[] region, IntPtr input_row_pitch, IntPtr input_slice_pitch, void *ptr, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueMapImage</c>;
/// raw-pointer variant: origin, region and event wait list are passed as <c>IntPtr*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="image">Image handle.</param>
/// <param name="blocking_map">Blocking flag passed to the native call.</param>
/// <param name="map_flags">Map flags passed to the native call.</param>
/// <param name="origin">Pointer to the origin values.</param>
/// <param name="region">Pointer to the region values.</param>
/// <param name="image_row_pitch">Receives the row pitch reported by the native call.</param>
/// <param name="image_slice_pitch">Receives the slice pitch reported by the native call.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Pointer to the wait-list entries.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <param name="errcode_ret">Receives the error code of the native call.</param>
/// <returns>The untyped pointer returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern void *clEnqueueMapImage( cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, cl_map_flags map_flags, IntPtr *origin, IntPtr *region, out IntPtr image_row_pitch, out IntPtr image_slice_pitch, cl_uint num_events_in_wait_list, IntPtr *event_wait_list, cl_event *_event, out ErrorCode errcode_ret);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueMapImage</c>;
/// managed-array variant: origin and region are <c>IntPtr[]</c> marshalled as input-only
/// <c>LPArray</c> with <c>SizeConst = 3</c>, the wait list is a <c>cl_event[]</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="image">Image handle.</param>
/// <param name="blocking_map">Blocking flag passed to the native call.</param>
/// <param name="map_flags">Map flags passed to the native call.</param>
/// <param name="origin">Origin values (declared with three elements).</param>
/// <param name="region">Region values (declared with three elements).</param>
/// <param name="image_row_pitch">Receives the row pitch reported by the native call.</param>
/// <param name="image_slice_pitch">Receives the slice pitch reported by the native call.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="event_wait_list"/>.</param>
/// <param name="event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <param name="errcode_ret">Receives the error code of the native call.</param>
/// <returns>The untyped pointer returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern void *clEnqueueMapImage( cl_command_queue command_queue, cl_mem image, cl_bool blocking_map, cl_map_flags map_flags, [In][MarshalAs(UnmanagedType.LPArray, SizeConst = 3)] IntPtr[] origin, [In][MarshalAs(UnmanagedType.LPArray, SizeConst = 3)] IntPtr[] region, out IntPtr image_row_pitch, out IntPtr image_slice_pitch, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] event_wait_list, cl_event *_event, out ErrorCode errcode_ret);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueCopyBufferRect</c>;
/// raw-pointer variant: origins and region are <c>IntPtr*</c>, the wait list is <c>cl_event*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="src_buffer">Source buffer handle.</param>
/// <param name="dst_buffer">Destination buffer handle.</param>
/// <param name="src_origin">Pointer to the source origin values.</param>
/// <param name="dst_origin">Pointer to the destination origin values.</param>
/// <param name="region">Pointer to the region values.</param>
/// <param name="src_row_pitch">Row pitch of the source.</param>
/// <param name="src_slice_pitch">Slice pitch of the source.</param>
/// <param name="dst_row_pitch">Row pitch of the destination.</param>
/// <param name="dst_slice_pitch">Slice pitch of the destination.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="_event_wait_list"/>.</param>
/// <param name="_event_wait_list">Pointer to the wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueCopyBufferRect( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, IntPtr *src_origin, IntPtr *dst_origin, IntPtr *region, IntPtr src_row_pitch, IntPtr src_slice_pitch, IntPtr dst_row_pitch, IntPtr dst_slice_pitch, cl_uint num_events_in_wait_list, cl_event *_event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueCopyBufferRect</c>;
/// managed-array variant: origins and region are input-only <c>IntPtr[]</c> (no explicit
/// <c>MarshalAs</c>), the wait list is a <c>cl_event[]</c> marshalled as <c>LPArray</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="src_buffer">Source buffer handle.</param>
/// <param name="dst_buffer">Destination buffer handle.</param>
/// <param name="src_origin">Source origin values.</param>
/// <param name="dst_origin">Destination origin values.</param>
/// <param name="region">Region values.</param>
/// <param name="src_row_pitch">Row pitch of the source.</param>
/// <param name="src_slice_pitch">Slice pitch of the source.</param>
/// <param name="dst_row_pitch">Row pitch of the destination.</param>
/// <param name="dst_slice_pitch">Slice pitch of the destination.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="_event_wait_list"/>.</param>
/// <param name="_event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueCopyBufferRect( cl_command_queue command_queue, cl_mem src_buffer, cl_mem dst_buffer, [In] IntPtr[] src_origin, [In] IntPtr[] dst_origin, [In] IntPtr[] region, IntPtr src_row_pitch, IntPtr src_slice_pitch, IntPtr dst_row_pitch, IntPtr dst_slice_pitch, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] _event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueWriteBufferRect</c>;
/// raw-pointer variant: offsets and region are <c>IntPtr*</c>, the wait list is <c>cl_event*</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="buffer">Buffer handle.</param>
/// <param name="blocking_write">Blocking flag passed to the native call.</param>
/// <param name="buffer_offset">Pointer to the buffer offset values.</param>
/// <param name="host_offset">Pointer to the host offset values.</param>
/// <param name="region">Pointer to the region values.</param>
/// <param name="buffer_row_pitch">Row pitch of the buffer.</param>
/// <param name="buffer_slice_pitch">Slice pitch of the buffer.</param>
/// <param name="host_row_pitch">Row pitch of the host data.</param>
/// <param name="host_slice_pitch">Slice pitch of the host data.</param>
/// <param name="ptr">Untyped pointer to the host data.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="_event_wait_list"/>.</param>
/// <param name="_event_wait_list">Pointer to the wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueWriteBufferRect( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, IntPtr *buffer_offset, IntPtr *host_offset, IntPtr *region, IntPtr buffer_row_pitch, IntPtr buffer_slice_pitch, IntPtr host_row_pitch, IntPtr host_slice_pitch, void *ptr, cl_uint num_events_in_wait_list, cl_event *_event_wait_list, cl_event *_event);
/// <summary>
/// Extern declaration named after the OpenCL C API function <c>clEnqueueWriteBufferRect</c>;
/// managed-array variant: offsets and region are input-only <c>IntPtr[]</c> (no explicit
/// <c>MarshalAs</c>), the wait list is a <c>cl_event[]</c> marshalled as <c>LPArray</c>.
/// </summary>
/// <param name="command_queue">Command queue handle.</param>
/// <param name="buffer">Buffer handle.</param>
/// <param name="blocking_write">Blocking flag passed to the native call.</param>
/// <param name="buffer_offset">Buffer offset values.</param>
/// <param name="host_offset">Host offset values.</param>
/// <param name="region">Region values.</param>
/// <param name="buffer_row_pitch">Row pitch of the buffer.</param>
/// <param name="buffer_slice_pitch">Slice pitch of the buffer.</param>
/// <param name="host_row_pitch">Row pitch of the host data.</param>
/// <param name="host_slice_pitch">Slice pitch of the host data.</param>
/// <param name="ptr">Untyped pointer to the host data.</param>
/// <param name="num_events_in_wait_list">Number of entries in <paramref name="_event_wait_list"/>.</param>
/// <param name="_event_wait_list">Managed array of wait-list events.</param>
/// <param name="_event">Pointer to an event handle.</param>
/// <returns>The <c>ErrorCode</c> returned by the native call.</returns>
/// <remarks>NOTE(review): the binding attribute is not visible here; parameter semantics are presumably those of the OpenCL function of the same name.</remarks>
internal static extern ErrorCode clEnqueueWriteBufferRect( cl_command_queue command_queue, cl_mem buffer, cl_bool blocking_write, [In] IntPtr[] buffer_offset, [In] IntPtr[] host_offset, [In] IntPtr[] region, IntPtr buffer_row_pitch, IntPtr buffer_slice_pitch, IntPtr host_row_pitch, IntPtr host_slice_pitch, void *ptr, cl_uint num_events_in_wait_list, [In][MarshalAs(UnmanagedType.LPArray)] cl_event[] _event_wait_list, cl_event *_event);
/// <summary>
/// Binds the arguments of the multiply kernel: matrix values, column indices, row starts,
/// the accumulator and input vectors, a local scratch area of <c>localsize + 1</c> ints,
/// the two scalars and the size.
/// </summary>
/// <param name="alpha">Scalar passed as kernel argument 6.</param>
/// <param name="a">Vector whose device pointer is passed as kernel argument 4.</param>
/// <param name="beta">Scalar passed as kernel argument 7.</param>
/// <param name="acc">Vector whose device pointer is passed as kernel argument 3.</param>
internal override void SetArguments(double alpha, clVector a, double beta, clVector acc) {
    cl_mem inputBuffer = a.GetDevicePointer();
    cl_mem outputBuffer = acc.GetDevicePointer();

    // Matrix storage.
    cl.SetKernelArg(clmultiply, 0, d_val);
    cl.SetKernelArg(clmultiply, 1, d_colIdx);
    cl.SetKernelArg(clmultiply, 2, d_rowStart);

    // Vectors: accumulator first, then the input.
    cl.SetKernelArg(clmultiply, 3, outputBuffer);
    cl.SetKernelArg(clmultiply, 4, inputBuffer);

    // Local memory: one int per work item plus one extra entry.
    cl.SetKernelArgLocalSize(clmultiply, 5, (uint)(localsize + 1) * sizeof(int));

    // Scalars.
    cl.SetKernelArg(clmultiply, 6, alpha);
    cl.SetKernelArg(clmultiply, 7, beta);
    cl.SetKernelArg(clmultiply, 8, size);
}
/// <summary>
/// Binds the arguments of the multiply kernel: matrix values, column indices, the input and
/// accumulator vectors, the two scalars, and the size/layout parameters.
/// </summary>
/// <param name="alpha">Scalar passed as kernel argument 4.</param>
/// <param name="a">Vector whose device pointer is passed as kernel argument 2.</param>
/// <param name="beta">Scalar passed as kernel argument 5.</param>
/// <param name="acc">Vector whose device pointer is passed as kernel argument 3.</param>
internal override void SetArguments(double alpha, clVector a, double beta, clVector acc) {
    cl_mem inputBuffer = a.GetDevicePointer();
    cl_mem outputBuffer = acc.GetDevicePointer();

    // Matrix storage.
    cl.SetKernelArg(clmultiply, 0, d_val);
    cl.SetKernelArg(clmultiply, 1, d_colIdx);

    // Vectors: input first, then the accumulator.
    cl.SetKernelArg(clmultiply, 2, inputBuffer);
    cl.SetKernelArg(clmultiply, 3, outputBuffer);

    // Scalars.
    cl.SetKernelArg(clmultiply, 4, alpha);
    cl.SetKernelArg(clmultiply, 5, beta);

    // Size and layout parameters.
    cl.SetKernelArg(clmultiply, 6, size);
    cl.SetKernelArg(clmultiply, 7, colCount);
    cl.SetKernelArg(clmultiply, 8, valStride);
    cl.SetKernelArg(clmultiply, 9, colStride);
}
/// <summary>
/// Prepares the accumulation of the external part: stores <paramref name="alpha"/>,
/// remembers the device pointer of <paramref name="acc"/> and zeroes the first
/// <c>extSize</c> doubles of the host staging memory <c>h_ElementsToAcc</c>.
/// </summary>
/// <param name="alpha">Scalar stored in <c>m_alpha</c>.</param>
/// <param name="beta">Not used by this method.</param>
/// <param name="acc">Accumulator vector; must be a <c>clVector</c>.</param>
internal override void SpMV_External_Begin(double alpha, double beta, VectorBase acc) {
    m_alpha = alpha;

    clVector accumulator = (clVector)acc;
    d_acc = accumulator.GetDevicePointer();

    // Clear the host-side staging memory.
    unsafe {
        double* staging = (double*)h_ElementsToAcc;
        int count = (int)extSize;
        for (int k = 0; k < count; k++) {
            staging[k] = 0;
        }
    }
}
/// <summary>
/// Binds the arguments of the multiply kernel: matrix values, column indices, the block
/// sub-vector tables, the input and accumulator vectors, a local scratch area of
/// <c>maxBlockSubVectorLength</c> doubles, the two scalars, and the size/layout parameters.
/// </summary>
/// <param name="alpha">Scalar passed as kernel argument 7.</param>
/// <param name="a">Vector whose device pointer is passed as kernel argument 4.</param>
/// <param name="beta">Scalar passed as kernel argument 8.</param>
/// <param name="acc">Vector whose device pointer is passed as kernel argument 5.</param>
internal override void SetArguments(double alpha, clVector a, double beta, clVector acc) {
    cl_mem inputBuffer = a.GetDevicePointer();
    cl_mem outputBuffer = acc.GetDevicePointer();

    // Matrix storage.
    cl.SetKernelArg(clmultiply, 0, d_val);
    cl.SetKernelArg(clmultiply, 1, d_colIdx);

    // Block sub-vector tables.
    cl.SetKernelArg(clmultiply, 2, d_xSubStart);
    cl.SetKernelArg(clmultiply, 3, d_blockSubVector);

    // Vectors: input first, then the accumulator.
    cl.SetKernelArg(clmultiply, 4, inputBuffer);
    cl.SetKernelArg(clmultiply, 5, outputBuffer);

    // Local memory sized for the longest block sub-vector.
    cl.SetKernelArgLocalSize(clmultiply, 6, (uint)maxBlockSubVectorLength * sizeof(double));

    // Scalars.
    cl.SetKernelArg(clmultiply, 7, alpha);
    cl.SetKernelArg(clmultiply, 8, beta);

    // Size and layout parameters.
    cl.SetKernelArg(clmultiply, 9, size);
    cl.SetKernelArg(clmultiply, 10, colCount);
    cl.SetKernelArg(clmultiply, 11, valStride);
    cl.SetKernelArg(clmultiply, 12, colStride);
}
/// <summary>
/// Binds the arguments of the multiply kernel: cell data, the input vector, cell column
/// indices, the accumulator vector, three local scratch areas, the two scalars, and the
/// cell layout parameters.
/// </summary>
/// <param name="alpha">Scalar passed as kernel argument 7.</param>
/// <param name="a">Vector whose device pointer is passed as kernel argument 1.</param>
/// <param name="beta">Scalar passed as kernel argument 8.</param>
/// <param name="acc">Vector whose device pointer is passed as kernel argument 3.</param>
internal override void SetArguments(double alpha, clVector a, double beta, clVector acc) {
    cl_mem inputBuffer = a.GetDevicePointer();
    cl_mem outputBuffer = acc.GetDevicePointer();

    // Device buffers, interleaved as the kernel expects them.
    cl.SetKernelArg(clmultiply, 0, d_cellData);
    cl.SetKernelArg(clmultiply, 1, inputBuffer);
    cl.SetKernelArg(clmultiply, 2, d_cellColIdx);
    cl.SetKernelArg(clmultiply, 3, outputBuffer);

    // Local memory: one double per work item, and two int areas with one entry per cell row of a block.
    cl.SetKernelArgLocalSize(clmultiply, 4, (uint)(localsize * sizeof(double)));
    cl.SetKernelArgLocalSize(clmultiply, 5, (uint)(cellrowsperblock * sizeof(int)));
    cl.SetKernelArgLocalSize(clmultiply, 6, (uint)(cellrowsperblock * sizeof(int)));

    // Scalars.
    cl.SetKernelArg(clmultiply, 7, alpha);
    cl.SetKernelArg(clmultiply, 8, beta);

    // Cell layout parameters.
    cl.SetKernelArg(clmultiply, 9, cellsize);
    cl.SetKernelArg(clmultiply, 10, cellrowsperblock);
    cl.SetKernelArg(clmultiply, 11, cellsperrow);
    cl.SetKernelArg(clmultiply, 12, stride);
    cl.SetKernelArg(clmultiply, 13, size);
}