Ejemplo n.º 1
0
        /// <summary>
        /// Sets up the GPU side of a sparse matrix: packs the matrix, creates two
        /// CUDA streams, looks up the multiplication and accumulation kernels and,
        /// if <c>extSize</c> is positive, prepares the buffers used for accumulating
        /// the external part.
        /// </summary>
        /// <param name="M">matrix handed to the base constructor and to <c>PackMatrix</c></param>
        /// <param name="funcName">name of the kernel function used for the sparse multiplication</param>
        /// <param name="CudaEnv">environment from which the kernel functions are obtained</param>
        public CudaMatrix(MsrMatrix M, string funcName, CudaEnviroment CudaEnv)
            : base(M)
        {
            m_CudaEnv = CudaEnv;
            base.PackMatrix(M);

            // two separate streams (named for the internal and the external part)
            cu.StreamCreate(out intStream, 0);
            cu.StreamCreate(out extStream, 0);
            disposed = false;

            // kernel handles: the multiplication kernel is chosen by the caller,
            // the accumulation kernel is always "accumulateExternal"
            sparseMultiply = CudaEnv.Get_CudaMatrixKernelDP_Function(funcName);
            cuaccext       = CudaEnv.Get_CudaMatrixKernelDP_Function("accumulateExternal");

            // Diagnostic (disabled): cu.FuncGetAttribute on sparseMultiply with
            // CU_FUNC_ATTRIBUTE_NUM_REGS / CU_FUNC_ATTRIBUTE_BINARY_VERSION can be
            // used to print the register count and binary version of the kernel.

            // NOTE(review): LMAA is defined elsewhere; extSize and
            // h_IndicesToAccumulate are presumably valid after this call - confirm.
            LMAA();

            // without an external part there is nothing left to allocate
            if (extSize <= 0)
            {
                return;
            }

            var valueBytes = sizeof(double) * (uint)extSize;
            var indexBytes = (uint)extSize * sizeof(int);

            // page-locked, device-mapped host buffer and its device-side pointer
            cu.MemHostAlloc(out h_ElementsToAcc, valueBytes, CUmem_host_alloc.CU_MEMHOSTALLOC_DEVICEMAP);
            cu.MemHostGetDevicePointer(out d_ElementsToAcc, h_ElementsToAcc, 0);

            // The indices for combining external and internal part do not change
            // over execution, so they are copied to the GPU once, here.
            cu.MemAlloc(out d_IndicesToAccumulate, indexBytes);
            cu.MemcpyHtoD(d_IndicesToAccumulate, h_IndicesToAccumulate, indexBytes);
        }
Ejemplo n.º 2
0
 /// <summary>
 /// constructor which uses memory that is allocated elsewhere
 /// </summary>
 /// <param name="P">
 /// partition passed to the base constructor; its local length must not
 /// exceed the length of <paramref name="content"/>
 /// </param>
 /// <param name="content">
 /// used to initialize <see cref="h_data"/>; the array reference is stored, not copied
 /// </param>
 /// <param name="env">passed on to <c>ConstructorCommon</c></param>
 /// <exception cref="ArgumentNullException">
 /// <paramref name="content"/> is null
 /// </exception>
 /// <exception cref="ArgumentException">
 /// <paramref name="content"/> is shorter than the local length of <paramref name="P"/>
 /// </exception>
 public CudaVector(IPartitioning P, double[] content, CudaEnviroment env) : base(P)
 {
     // fail with a descriptive exception instead of a NullReferenceException
     // on content.Length below
     if (content == null)
     {
         throw new ArgumentNullException("content");
     }

     // only a too-short array is rejected; a longer one is accepted as-is
     if (P.LocalLength > content.Length)
     {
         throw new ArgumentException("vector content must match local length of partition", "content");
     }

     h_data = content;
     ConstructorCommon(env);
 }
Ejemplo n.º 3
0
 /// <summary>
 /// Creates the matrix on top of the <c>ellMultiply</c> kernel and caches the
 /// dimensions and strides of the packed local matrix.
 /// </summary>
 /// <param name="M">Sparse matrix in MSR format</param>
 /// <param name="CudaEnv">forwarded to the base constructor</param>
 public CudaELLPACKmodMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
     : base(M, "ellMultiply", CudaEnv)
 {
     m_internalData = (ELLPACKmod)m_LocalMtx;

     // strides of the value storage and of the column-index storage
     valStride = m_internalData.MtxEntries.ColStride;
     colStride = m_internalData.ColInd.ColStride;

     // dimensions of the packed matrix
     colCount = m_internalData.NoOfPackedCols;
     size     = m_internalData.NoOfRows;

     // size / blocksize, rounded up to a whole number of blocks
     decimal blocksNeeded = Math.Ceiling((decimal)size / blocksize);
     blockcount = (int)blocksNeeded;
 }
Ejemplo n.º 4
0
 /// <summary>
 /// Creates the matrix on top of the <c>mcellMultiply</c> kernel and caches the
 /// dimensions and strides of the packed local matrix. The body runs inside a
 /// <c>FuncTrace</c> scope.
 /// </summary>
 /// <param name="M">Sparse matrix in MSR format</param>
 /// <param name="CudaEnv">forwarded to the base constructor</param>
 public CudaELLPACKcacheMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
     : base(M, "mcellMultiply", CudaEnv)
 {
     using (new FuncTrace())
     {
         m_internalData = (ManualCacheELLPACK)m_LocalMtx;

         // strides of the value storage and of the column-index block storage
         valStride = m_internalData.MtxEntries.ColStride;
         colStride = m_internalData.ColIndBlock.ColStride;

         // dimensions of the packed matrix
         colCount = m_internalData.NoOfPackedCols;
         size     = m_internalData.NoOfRows;

         // size / blocksize, rounded up to a whole number of blocks
         decimal blocksNeeded = Math.Ceiling((decimal)size / blocksize);
         blockcount = (int)blocksNeeded;
     }
 }
Ejemplo n.º 5
0
        /// <summary>
        /// Initialization shared by the constructors: stores the environment,
        /// looks up the vector kernels and computes the block counts for the
        /// local vector length.
        /// </summary>
        /// <param name="env">environment from which the vector kernel functions are obtained</param>
        private void ConstructorCommon(CudaEnviroment env)
        {
            m_env = env;

            // handles of the double-precision vector kernels, looked up by name
            cuscale     = m_env.Get_CudaVectorKernelDP_Function("scale");
            cuacc       = m_env.Get_CudaVectorKernelDP_Function("acc");
            cudnrm2     = m_env.Get_CudaVectorKernelDP_Function("dnrm2");
            cuinnerprod = m_env.Get_CudaVectorKernelDP_Function("innerprod");
            cumew       = m_env.Get_CudaVectorKernelDP_Function("mew");

            size = this.Part.LocalLength;

            // "full": one block per blocksize entries; "half": one block per
            // 2 * blocksize entries; both rounded up
            decimal fullBlocks = Math.Ceiling((decimal)size / blocksize);
            decimal halfBlocks = Math.Ceiling((decimal)size / (2 * blocksize));
            blockcountfull = (int)fullBlocks;
            blockcounthalf = (int)halfBlocks;
        }
Ejemplo n.º 6
0
        /// <summary>
        /// Creates the matrix on top of the <c>blockMultiply2</c> kernel and derives
        /// the threads per block and the number of blocks from the cell size of the
        /// packed local matrix.
        /// </summary>
        /// <param name="M">matrix forwarded to the base constructor</param>
        /// <param name="CudaEnv">forwarded to the base constructor</param>
        internal CudaCCBCSRMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
            : base(M, "blockMultiply2", CudaEnv)
        {
            m_internalData = (CCBCSR)m_LocalMtx;

            // layout information taken over from the packed matrix
            cellsperrow = m_internalData.NoOfCellsPerRow;
            stride      = m_internalData.CellStride;
            cellsize    = m_internalData.CellSize;
            rowcount    = base.RowPartitioning.LocalLength;

            // cell rows per block: chosen so that a block has around 128 threads
            double cellRowsFor128Threads = Math.Ceiling(128.0 / cellsize);
            cellrowsperblock = (int)cellRowsFor128Threads;

            // threads per block
            blocksize = cellsize * cellrowsperblock;

            // number of blocks: rowcount / blocksize, rounded up
            decimal blocksNeeded = Math.Ceiling((decimal)rowcount / blocksize);
            blockcount = (int)blocksNeeded;
        }
Ejemplo n.º 7
0
 /// <summary>
 /// Creates the device object; the given environment is only stored here.
 /// </summary>
 /// <param name="cuEnv">Distribution of processes and CUDA devices</param>
 public CudaDevice(CudaEnviroment cuEnv)
 {
     this.m_Env = cuEnv;
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Constructor that allocates its own host storage: a new array with one
 /// entry per locally stored vector element.
 /// </summary>
 /// <param name="p">vector partition among MPI processes</param>
 /// <param name="env">passed on to <c>ConstructorCommon</c></param>
 public CudaVector(IPartitioning p, CudaEnviroment env)
     : base(p)
 {
     var localLength = p.LocalLength;
     h_data = new double[localLength];

     ConstructorCommon(env);
 }
Ejemplo n.º 9
0
 /// <summary>
 /// Creates the matrix on top of the <c>sparseMultiply</c> kernel and computes
 /// the number of blocks for the local row count.
 /// </summary>
 /// <param name="M">matrix forwarded to the base constructor</param>
 /// <param name="CudaEnv">forwarded to the base constructor</param>
 public CudaCSRMatrix(MsrMatrix M, CudaEnviroment CudaEnv)
     : base(M, "sparseMultiply", CudaEnv)
 {
     size = base.RowPartitioning.LocalLength;

     // size / blocksize, rounded up to a whole number of blocks
     decimal blocksNeeded = Math.Ceiling((decimal)size / blocksize);
     blockcount = (int)blocksNeeded;
 }