/// <summary>
/// Creates an OpenCL vector that works on caller-supplied memory.
/// </summary>
/// <param name="p">Partition handed to the base class and to <c>init</c>.</param>
/// <param name="content">
/// Memory for this vector; the array reference is stored as-is (it is not copied).
/// </param>
/// <param name="device">OpenCL device; its <c>vectorProgram</c> is passed to <c>init</c>.</param>
public clVector(IPartitioning p, double[] content, clDevice device)
    : base(p) {
    this.device = device;
    h_data = content;

    init(p, device.vectorProgram);
}
/// <summary>
/// Creates an OpenCL vector that allocates its own storage.
/// </summary>
/// <param name="p">
/// Partition handed to the base class and to <c>init</c>; its <c>LocalLength</c>
/// determines the length of the newly allocated array.
/// </param>
/// <param name="device">OpenCL device; its <c>vectorProgram</c> is passed to <c>init</c>.</param>
public clVector(IPartitioning p, clDevice device)
    : base(p) {
    this.device = device;
    h_data = new double[p.LocalLength];

    init(p, device.vectorProgram);
}
/// <summary>
/// Creates the matrix: packs <paramref name="M"/>, creates the multiply and
/// <c>accumulateExternal</c> kernels and, when <c>extSize</c> is positive,
/// allocates the buffers used for accumulating external elements.
/// </summary>
/// <param name="M">Original matrix.</param>
/// <param name="device">Corresponding OpenCL device.</param>
/// <param name="kernelName">Name of the kernel function used for the multiplication.</param>
public clMatrix(MsrMatrix M, clDevice device, string kernelName)
    : base(M) {
    this.device = device;
    base.PackMatrix(M);

    this.clmultiply = cl.CreateKernel(device.matrixProgram, kernelName);
    this.claccext = cl.CreateKernel(device.matrixProgram, "accumulateExternal");
    disposed = false;

    // NOTE(review): LMAA() is defined elsewhere; presumably it prepares extSize and
    // h_IndicesToAccumulate, which are read below -- confirm.
    LMAA();

    if (extSize <= 0) {
        // Nothing external to accumulate, so no additional buffers are created.
        return;
    }

    // extglobalsize is extSize rounded up to the next multiple of extlocalsize.
    int remainder = extSize % extlocalsize;
    extglobalsize = remainder > 0 ? extSize + (extlocalsize - remainder) : extSize;

    uint valueBytes = (uint)extSize * sizeof(double);
    uint indexBytes = (uint)extSize * sizeof(int);

    // Unmanaged host buffer plus the two device buffers (values and indices).
    h_ElementsToAcc = Marshal.AllocHGlobal(extSize * sizeof(double));
    d_ElementsToAcc = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, valueBytes);
    d_IndicesToAccumulate = cl.CreateBuffer(device.env.context, cl_mem_flags.CL_MEM_READ_ONLY, indexBytes);

    // Upload the indices once; the third argument is presumably the blocking-write
    // flag of the cl wrapper -- confirm.
    cl.EnqueueWriteBuffer(device.cq, d_IndicesToAccumulate, true, 0, indexBytes, h_IndicesToAccumulate);
}
/// <summary>
/// Creates the matrix using the <c>csrMultiply</c> kernel and derives the launch sizes
/// from the local length of the row partitioning.
/// </summary>
/// <param name="M">Original matrix, forwarded to the base constructor.</param>
/// <param name="device">OpenCL device, forwarded to the base constructor.</param>
public clCSRMatrix(MsrMatrix M, clDevice device)
    : base(M, device, "csrMultiply") {
    size = base.RowPartitioning.LocalLength;
    localsize = 256;

    // globalsize is size rounded up to the next multiple of localsize.
    int remainder = size % localsize;
    globalsize = remainder > 0 ? size + (localsize - remainder) : size;
}
/// <summary>
/// Returns the device stored in <c>m_DeviceS</c> for <paramref name="DevType"/>;
/// if none is stored yet, creates it, stores it and returns it.
/// </summary>
/// <param name="DevType">
/// Requested device type. <c>Auto</c> tries CUDA first and falls back to the CPU
/// reference device when CUDA cannot be created.
/// </param>
/// <returns>The stored or newly created device.</returns>
/// <exception cref="NotImplementedException">
/// <paramref name="DevType"/> has no corresponding case in this factory.
/// </exception>
static Device GetOrCereateDevice(DeviceType DevType) {
    Device cached;
    if (m_DeviceS.TryGetValue(DevType, out cached) && cached != null) {
        return cached;
    }

    Device created;
    switch (DevType) {
        case DeviceType.Cuda:
            created = new CUDA.CudaDevice(new CUDA.CudaEnviroment(Environment.MPIEnv));
            break;

        case DeviceType.OpenCL:
            created = new CL.clDevice(new CL.clEnvironment(Environment.MPIEnv));
            break;

        case DeviceType.CPU:
            created = new CPU.ReferenceDevice();
            break;

        case DeviceType.MultiThreadCPU:
            created = new mtCPU.MtDevice();
            break;

        case DeviceType.Auto: {
            // Try CUDA first; any failure is deliberately swallowed so that the
            // fallback below can take over.
            Device cudaCandidate;
            try {
                cudaCandidate = GetOrCereateDevice(DeviceType.Cuda);
            } catch (Exception) {
                cudaCandidate = null;
            }

            // Trying OpenCL next is currently disabled:
            //try {
            //    Dev = GetOrCereateDevice(DeviceType.OpenCL);
            //} catch (Exception) {
            //    Dev = null;
            //}
            //if (Dev != null) break;

            // Fall back to the CPU when CUDA is unavailable.
            created = cudaCandidate != null
                ? cudaCandidate
                : GetOrCereateDevice(DeviceType.CPU);
            break;
        }

        default:
            throw new NotImplementedException("monkey device type: " + DevType.ToString() + " missing in factory.");
    }

    m_DeviceS.Add(DevType, created);
    return created;
}
/// <summary>
/// Creates the matrix using the <c>ellMultiply</c> kernel and caches the dimensions and
/// strides of the packed <c>ELLPACKmod</c> storage.
/// </summary>
/// <param name="M">Original matrix, forwarded to the base constructor.</param>
/// <param name="device">OpenCL device, forwarded to the base constructor.</param>
public clELLPACKmodMatrix(MsrMatrix M, clDevice device)
    : base(M, device, "ellMultiply") {
    ELLPACKmod packed = (ELLPACKmod)m_LocalMtx;
    m_internalData = packed;

    size = packed.NoOfRows;
    colCount = packed.NoOfPackedCols;
    valStride = packed.MtxEntries.ColStride;
    colStride = packed.ColInd.ColStride;

    localsize = 256;

    // globalsize is size rounded up to the next multiple of localsize.
    int remainder = size % localsize;
    globalsize = remainder > 0 ? size + (localsize - remainder) : size;
}
/// <summary>
/// Creates the matrix using the <c>ccbcsrMultiply</c> kernel and derives the block layout
/// and launch sizes from the packed <c>CCBCSR</c> storage.
/// </summary>
/// <param name="M">Original matrix, forwarded to the base constructor.</param>
/// <param name="device">OpenCL device, forwarded to the base constructor.</param>
public clCCBCSRMatrix(MsrMatrix M, clDevice device)
    : base(M, device, "ccbcsrMultiply") {
    CCBCSR packed = (CCBCSR)m_LocalMtx;
    m_internalData = packed;

    size = base.RowPartitioning.LocalLength;
    cellsize = packed.CellSize;

    // Cell rows per block: ceil(128 / cellsize).
    // NOTE(review): the earlier comment here said the target is around 256 threads per
    // block, but the constant in use is 128 -- confirm which one is intended.
    double cellRowsExact = 128.0 / cellsize;
    cellrowsperblock = (int)Math.Ceiling(cellRowsExact);

    cellsperrow = packed.NoOfCellsPerRow;
    stride = packed.CellStride;

    // Number of threads per block.
    localsize = cellsize * cellrowsperblock;

    // globalsize is size rounded up to the next multiple of localsize.
    int remainder = size % localsize;
    globalsize = remainder > 0 ? size + (localsize - remainder) : size;
}