/// <summary>
/// Creates the matrix from <paramref name="M"/> and derives the launch
/// configuration (threads per block and number of blocks) from the cell
/// layout of the local matrix.
/// </summary>
/// <param name="M">Source matrix; forwarded unchanged to the base constructor.</param>
/// <param name="CudaEnv">CUDA environment; forwarded unchanged to the base constructor.</param>
internal CudaCCBCSRMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
    : base(M, "blockMultiply2", CudaEnv)
{
    // NOTE(review): "blockMultiply2" is presumably the name of the device
    // function used for the multiplication; confirm against the base class.

    // Desired number of threads in one block (approximate target).
    const double targetThreadsPerBlock = 128.0;

    // The base class holds the local matrix; here it is accessed as CCBCSR.
    m_internalData = (CCBCSR)m_LocalMtx;
    rowcount = base.RowPartitioning.LocalLength;

    cellsize = m_internalData.CellSize;

    // Cell rows per block, rounded up so that cellsize * cellrowsperblock
    // reaches at least the target thread count.
    cellrowsperblock = (int)Math.Ceiling(targetThreadsPerBlock / cellsize);

    cellsperrow = m_internalData.NoOfCellsPerRow;
    stride = m_internalData.CellStride;

    // Threads per block.
    blocksize = cellsize * cellrowsperblock;

    // Number of blocks: rowcount / blocksize, rounded up so that a partially
    // filled last block is still counted.
    Decimal blocksNeeded = (Decimal)rowcount / blocksize;
    blockcount = (int)Math.Ceiling(blocksNeeded);
}
/// <summary>
/// Creates the matrix from <paramref name="M"/> and derives the local and
/// global work sizes from the cell layout of the local matrix.
/// </summary>
/// <param name="M">Source matrix; forwarded unchanged to the base constructor.</param>
/// <param name="device">Device; forwarded unchanged to the base constructor.</param>
public clCCBCSRMatrix(MsrMatrix M, clDevice device)
    : base(M, device, "ccbcsrMultiply")
{
    // NOTE(review): "ccbcsrMultiply" is presumably the kernel name; confirm
    // against the base class.

    // Desired number of threads in one block (approximate target).
    // NOTE(review): the previous comment here said "around 256 threads per
    // block" while the code used 128.0; the value is kept as-is. Confirm
    // which of the two was intended.
    const double targetThreadsPerBlock = 128.0;

    // The base class holds the local matrix; here it is accessed as CCBCSR.
    m_internalData = (CCBCSR)m_LocalMtx;
    size = base.RowPartitioning.LocalLength;

    cellsize = m_internalData.CellSize;

    // Cell rows per block, rounded up so that cellsize * cellrowsperblock
    // reaches at least the target thread count.
    cellrowsperblock = (int)Math.Ceiling(targetThreadsPerBlock / cellsize);

    cellsperrow = m_internalData.NoOfCellsPerRow;
    stride = m_internalData.CellStride;

    // Threads per block.
    localsize = cellsize * cellrowsperblock;

    // Global size starts at the local row count and is padded up to the next
    // multiple of localsize when it does not divide evenly.
    globalsize = size;
    int remainder = size % localsize;
    if (remainder > 0)
    {
        int padding = localsize - remainder;
        globalsize += padding;
    }
}