/// <summary>
/// Sets up the CUDA-side matrix: packs <paramref name="M"/>, creates the internal and
/// external streams, fetches the multiplication kernel named <paramref name="funcName"/>
/// as well as the "accumulateExternal" kernel, and - if there is an external part
/// (<c>extSize</c> is positive) - allocates and fills the buffers needed for it.
/// </summary>
/// <param name="M">matrix handed to the base constructor and to <c>PackMatrix</c></param>
/// <param name="funcName">name of the kernel function used for the sparse multiplication</param>
/// <param name="CudaEnv">environment from which the kernel functions are obtained</param>
public CudaMatrix(MsrMatrix M, string funcName, CudaEnviroment CudaEnv)
    : base(M) {
    m_CudaEnv = CudaEnv;
    base.PackMatrix(M);

    // one stream each for the internal and the external part
    cu.StreamCreate(out intStream, 0);
    cu.StreamCreate(out extStream, 0);
    disposed = false;

    // kernel lookup
    // (for debugging, register count and binary version of 'sparseMultiply' can be
    //  queried through cu.FuncGetAttribute with CU_FUNC_ATTRIBUTE_NUM_REGS and
    //  CU_FUNC_ATTRIBUTE_BINARY_VERSION)
    sparseMultiply = CudaEnv.Get_CudaMatrixKernelDP_Function(funcName);
    cuaccext = CudaEnv.Get_CudaMatrixKernelDP_Function("accumulateExternal");

    LMAA();

    // nothing more to do without an external part
    if (extSize <= 0) {
        return;
    }

    // page-locked host memory, mapped into the device address space
    var elementBytes = sizeof(double) * (uint)extSize;
    cu.MemHostAlloc(out h_ElementsToAcc, elementBytes, CUmem_host_alloc.CU_MEMHOSTALLOC_DEVICEMAP);
    cu.MemHostGetDevicePointer(out d_ElementsToAcc, h_ElementsToAcc, 0);

    // the indices for combining external and internal part do not change
    // during execution, so they are copied to the GPU only once, here
    var indexBytes = (uint)extSize * sizeof(int);
    cu.MemAlloc(out d_IndicesToAccumulate, indexBytes);
    cu.MemcpyHtoD(d_IndicesToAccumulate, h_IndicesToAccumulate, indexBytes);
}
/// <summary>
/// constructor which uses memory that is allocated elsewhere
/// </summary>
/// <param name="P">partition passed to the base constructor; its local length is checked against <paramref name="content"/></param>
/// <param name="content">
/// used to initialize <see cref="h_data"/>; the array is stored by reference, not copied.
/// It must contain at least <c>P.LocalLength</c> entries (a longer array is accepted).
/// </param>
/// <param name="env">environment forwarded to <c>ConstructorCommon</c></param>
/// <exception cref="ArgumentNullException"><paramref name="content"/> is null</exception>
/// <exception cref="ArgumentException"><paramref name="content"/> is shorter than the local length of <paramref name="P"/></exception>
public CudaVector(IPartitioning P, double[] content, CudaEnviroment env)
    : base(P) {
    // fail with a descriptive argument exception instead of a NullReferenceException
    if (content == null) {
        throw new ArgumentNullException("content");
    }

    if (P.LocalLength > content.Length) {
        throw new ArgumentException("vector content must match local length of partition", "content");
    }

    h_data = content;
    ConstructorCommon(env);
}
/// <summary>
/// Creates the matrix using the "ellMultiply" kernel and caches the layout
/// information (row count, packed column count, strides) of the local matrix,
/// which is expected to be stored as <c>ELLPACKmod</c>.
/// </summary>
/// <param name="M">Sparse matrix in MSR format</param>
/// <param name="CudaEnv">environment passed on to the base constructor</param>
public CudaELLPACKmodMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
    : base(M, "ellMultiply", CudaEnv) {
    m_internalData = (ELLPACKmod)m_LocalMtx;
    var data = m_internalData;

    // dimensions
    size = data.NoOfRows;
    colCount = data.NoOfPackedCols;

    // strides of the value and column-index arrays
    valStride = data.MtxEntries.ColStride;
    colStride = data.ColInd.ColStride;

    // number of blocks: size / blocksize, rounded up
    decimal rows = (decimal)size;
    blockcount = (int)Math.Ceiling(rows / blocksize);
}
/// <summary>
/// Creates the matrix using the "mcellMultiply" kernel and caches the layout
/// information (row count, packed column count, strides) of the local matrix,
/// which is expected to be stored as <c>ManualCacheELLPACK</c>.
/// </summary>
/// <param name="M">Sparse matrix in MSR format</param>
/// <param name="CudaEnv">environment passed on to the base constructor</param>
public CudaELLPACKcacheMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
    : base(M, "mcellMultiply", CudaEnv) {
    using (new FuncTrace()) {
        m_internalData = (ManualCacheELLPACK)m_LocalMtx;
        var data = m_internalData;

        // dimensions
        size = data.NoOfRows;
        colCount = data.NoOfPackedCols;

        // strides of the value and block-column-index arrays
        valStride = data.MtxEntries.ColStride;
        colStride = data.ColIndBlock.ColStride;

        // number of blocks: size / blocksize, rounded up
        decimal rows = (decimal)size;
        blockcount = (int)Math.Ceiling(rows / blocksize);
    }
}
/// <summary>
/// Initialization shared by all constructors: stores the environment, looks up
/// the vector kernels ("scale", "acc", "dnrm2", "innerprod", "mew") and derives
/// the block counts from the local length of the partition.
/// </summary>
/// <param name="env">environment from which the kernel functions are obtained</param>
private void ConstructorCommon(CudaEnviroment env) {
    m_env = env;

    // kernel lookup
    cuscale = m_env.Get_CudaVectorKernelDP_Function("scale");
    cuacc = m_env.Get_CudaVectorKernelDP_Function("acc");
    cudnrm2 = m_env.Get_CudaVectorKernelDP_Function("dnrm2");
    cuinnerprod = m_env.Get_CudaVectorKernelDP_Function("innerprod");
    cumew = m_env.Get_CudaVectorKernelDP_Function("mew");

    size = this.Part.LocalLength;

    // block counts, rounded up: one block per 'blocksize' resp. per 2*'blocksize' entries
    decimal length = (decimal)size;
    blockcountfull = (int)Math.Ceiling(length / blocksize);
    blockcounthalf = (int)Math.Ceiling(length / (2 * blocksize));
}
/// <summary>
/// Creates the matrix using the "blockMultiply2" kernel and derives the launch
/// configuration (threads per block, number of blocks) from the cell layout of
/// the local matrix, which is expected to be stored as <c>CCBCSR</c>.
/// </summary>
/// <param name="M">matrix passed on to the base constructor</param>
/// <param name="CudaEnv">environment passed on to the base constructor</param>
internal CudaCCBCSRMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
    : base(M, "blockMultiply2", CudaEnv) {
    // aim for roughly this many threads in each block
    const double TargetThreadsPerBlock = 128.0;

    m_internalData = (CCBCSR)m_LocalMtx;
    var data = m_internalData;

    // layout information taken over from the packed matrix
    rowcount = base.RowPartitioning.LocalLength;
    cellsize = data.CellSize;
    cellsperrow = data.NoOfCellsPerRow;
    stride = data.CellStride;

    // number of cell rows per block, chosen so that a block has around 128 threads
    cellrowsperblock = (int)Math.Ceiling(TargetThreadsPerBlock / cellsize);

    // number of threads per block
    blocksize = cellsize * cellrowsperblock;

    // number of blocks: rowcount / blocksize, rounded up
    decimal rows = (decimal)rowcount;
    blockcount = (int)Math.Ceiling(rows / blocksize);
}
/// <summary>
/// Creates the device object and remembers the CUDA environment it belongs to.
/// </summary>
/// <param name="cuEnv">Distribution of processes and CUDA devices</param>
public CudaDevice(CudaEnviroment cuEnv) {
    this.m_Env = cuEnv;
}
/// <summary>
/// Creates a vector that owns its storage: a new array with
/// <c>p.LocalLength</c> entries is allocated for the host data.
/// </summary>
/// <param name="p">vector partition among MPI processes</param>
/// <param name="env">environment forwarded to <c>ConstructorCommon</c></param>
public CudaVector(IPartitioning p, CudaEnviroment env)
    : base(p) {
    var localLength = p.LocalLength;
    h_data = new double[localLength];

    ConstructorCommon(env);
}
/// <summary>
/// Creates the matrix using the "sparseMultiply" kernel; the number of blocks is
/// derived from the local length of the row partitioning.
/// </summary>
/// <param name="M">matrix passed on to the base constructor</param>
/// <param name="CudaEnv">environment passed on to the base constructor</param>
public CudaCSRMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
    : base(M, "sparseMultiply", CudaEnv) {
    size = base.RowPartitioning.LocalLength;

    // number of blocks: size / blocksize, rounded up
    decimal rows = (decimal)size;
    blockcount = (int)Math.Ceiling(rows / blocksize);
}