/// <summary>
/// Passes the arguments to the <c>sparseMultiply</c> kernel function and launches it
/// asynchronously on <paramref name="stream"/>.
/// </summary>
/// <param name="stream">stream handed to <c>cu.LaunchGridAsync</c></param>
/// <param name="alpha">first scalar kernel argument</param>
/// <param name="a">vector whose device pointer is passed as the third pointer argument</param>
/// <param name="beta">second scalar kernel argument</param>
/// <param name="acc">vector whose device pointer is passed as the fourth pointer argument</param>
internal override void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc)
{
    // Widths (in bytes) by which the parameter cursor advances for each argument kind.
    const int pointerBytes = sizeof(long);
    const int scalarBytes = sizeof(double);
    const int intBytes = sizeof(uint);

    CUdeviceptr inputPtr = a.GetDevicePointer();
    CUdeviceptr outputPtr = acc.GetDevicePointer();

    int cursor = 0;

    // pointer arguments
    cu.ParamSetp(sparseMultiply, cursor, d_val);
    cursor += pointerBytes;
    cu.ParamSetp(sparseMultiply, cursor, d_colIdx);
    cursor += pointerBytes;
    cu.ParamSetp(sparseMultiply, cursor, inputPtr);
    cursor += pointerBytes;
    cu.ParamSetp(sparseMultiply, cursor, outputPtr);
    cursor += pointerBytes;

    // floating point arguments
    cu.ParamSetd(sparseMultiply, cursor, alpha);
    cursor += scalarBytes;
    cu.ParamSetd(sparseMultiply, cursor, beta);
    cursor += scalarBytes;

    // integer arguments
    cu.ParamSeti(sparseMultiply, cursor, size);
    cursor += intBytes;
    cu.ParamSeti(sparseMultiply, cursor, colCount);
    cursor += intBytes;
    cu.ParamSeti(sparseMultiply, cursor, valStride);
    cursor += intBytes;
    cu.ParamSeti(sparseMultiply, cursor, colStride);
    cursor += intBytes;

    // the final cursor position is the total size of the parameter block
    cu.ParamSetSize(sparseMultiply, (uint)cursor);

    cu.FuncSetBlockShape(sparseMultiply, blocksize, 1, 1);
    cu.LaunchGridAsync(sparseMultiply, blockcount, 1, stream);
}
/// <summary>
/// Creates the send-side communication buffers for vector <paramref name="v"/>:
/// the index lists of <c>M._SpmvCommPattern.ComLists</c> are concatenated into one array that is
/// copied to the device, and one host buffer (allocated with
/// <c>CU_MEMHOSTALLOC_DEVICEMAP</c>) is partitioned into per-process send buffers.
/// </summary>
/// <param name="M">matrix providing the communication pattern</param>
/// <param name="v">vector stored as the owner of this object</param>
/// <param name="stream">stream stored for later use</param>
internal CudaCommVector(MatrixBase M, CudaVector v, CUstream stream)
    : base(M, v)
{
    this.owner = v;
    this.stream = stream;
    cufill = owner.m_env.Get_CudaVectorKernelDP_Function("fillSendBuffer");

    IDictionary<int, int[]> sendLists = M._SpmvCommPattern.ComLists;

    // First pass: record the send length per process rank and sum up the total length.
    int total = 0;
    foreach (int rank in sendLists.Keys)
    {
        int count = sendLists[rank].Length;
        base.SendBuffersLengths[rank] = count;
        total += count;
    }

    size = total;
    blockcount = (int)Math.Ceiling((decimal)size / blocksize);

    // Nothing to send: no buffers are allocated.
    if (!(size > 0))
    {
        return;
    }

    // Allocation: index array on host and device, send buffer on the host with a device pointer to it.
    var indexBytes = (uint)size * sizeof(int);
    h_IndicesToSend = new int[size];
    cu.MemAlloc(out d_IndicesToSend, indexBytes);
    cu.MemHostAlloc(out h_SendBuffer, sizeof(double) * (uint)size, CUmem_host_alloc.CU_MEMHOSTALLOC_DEVICEMAP);
    cu.MemHostGetDevicePointer(out d_SendBuffer, h_SendBuffer, 0);

    // Second pass: concatenate the index lists; each process gets a consecutive slice of the send buffer.
    int writePos = 0;
    unsafe
    {
        double* sliceStart = (double*)h_SendBuffer;
        foreach (int rank in sendLists.Keys)
        {
            // start address of the slice that is sent to process 'rank'
            base.SendBuffers[rank] = (IntPtr)sliceStart;
            int count = base.SendBuffersLengths[rank];
            sliceStart += count;

            // append the index list of this process
            Array.Copy(sendLists[rank], 0, h_IndicesToSend, writePos, count);
            writePos += count;
        }
    }

    cu.MemcpyHtoD(d_IndicesToSend, h_IndicesToSend, indexBytes);
}
/// <summary>
/// Calls the base implementation and then, unless the <c>disposed</c> flag is already set,
/// releases <c>h_ElementsToAcc</c> (via <c>cu.MemFreeHost</c>), <c>d_IndicesToAccumulate</c>
/// (via <c>cu.MemFree</c>) and destroys the internal and the external stream.
/// </summary>
public override void Dispose()
{
    base.Dispose();

    if (!disposed)
    {
        // host-side buffer
        cu.MemFreeHost(h_ElementsToAcc);
        h_ElementsToAcc = IntPtr.Zero;

        // device-side index list
        cu.MemFree(d_IndicesToAccumulate);

        // streams
        cu.StreamDestroy(intStream);
        intStream = default(CUstream);
        cu.StreamDestroy(extStream);
        extStream = default(CUstream);

        disposed = true;
    }
}
/// <summary>
/// Passes the arguments to the <c>sparseMultiply</c> kernel function, configures its block shape
/// and shared memory size, and launches it asynchronously on <paramref name="stream"/>.
/// </summary>
/// <param name="stream">stream handed to <c>cu.LaunchGridAsync</c></param>
/// <param name="alpha">first scalar kernel argument</param>
/// <param name="a">vector whose device pointer is passed as the second pointer argument</param>
/// <param name="beta">second scalar kernel argument</param>
/// <param name="acc">vector whose device pointer is passed as the fourth pointer argument</param>
internal override void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc)
{
    // Widths (in bytes) by which the parameter cursor advances for each argument kind.
    const int pointerBytes = sizeof(long);
    const int scalarBytes = sizeof(double);
    const int intBytes = sizeof(uint);

    CUdeviceptr inputPtr = a.GetDevicePointer();
    CUdeviceptr outputPtr = acc.GetDevicePointer();

    int cursor = 0;

    // pointer arguments
    cu.ParamSetp(sparseMultiply, cursor, d_cellData);
    cursor += pointerBytes;
    cu.ParamSetp(sparseMultiply, cursor, inputPtr);
    cursor += pointerBytes;
    cu.ParamSetp(sparseMultiply, cursor, d_cellColIdx);
    cursor += pointerBytes;
    cu.ParamSetp(sparseMultiply, cursor, outputPtr);
    cursor += pointerBytes;

    // floating point arguments
    cu.ParamSetd(sparseMultiply, cursor, alpha);
    cursor += scalarBytes;
    cu.ParamSetd(sparseMultiply, cursor, beta);
    cursor += scalarBytes;

    // integer arguments
    cu.ParamSeti(sparseMultiply, cursor, cellsize);
    cursor += intBytes;
    cu.ParamSeti(sparseMultiply, cursor, cellrowsperblock);
    cursor += intBytes;
    cu.ParamSeti(sparseMultiply, cursor, cellsperrow);
    cursor += intBytes;
    cu.ParamSeti(sparseMultiply, cursor, stride);
    cursor += intBytes;
    cu.ParamSeti(sparseMultiply, cursor, rowcount);
    cursor += intBytes;

    // the final cursor position is the total size of the parameter block
    cu.ParamSetSize(sparseMultiply, (uint)cursor);

    cu.FuncSetBlockShape(sparseMultiply, blocksize, 1, 1);

    // shared memory: one double per thread of the block plus two ints per cell row of the block
    cu.FuncSetSharedSize(
        sparseMultiply,
        (uint)(blocksize * sizeof(double) + 2 * cellrowsperblock * sizeof(int)));

    cu.LaunchGridAsync(sparseMultiply, blockcount, 1, stream);
}
/// <summary>
/// Wrapper around <c>cuStreamDestroy</c> (see CUDA doc); the returned status is handed to
/// <c>testResult</c>.
/// </summary>
/// <param name="hStream">stream handle passed to the driver call</param>
public static void StreamDestroy(CUstream hStream)
{
    var status = my.cuStreamDestroy(hStream);
    testResult(status);
}
/// <summary>
/// Wrapper around <c>cuStreamSynchronize</c> (see CUDA doc); the returned status is handed to
/// <c>testResult</c>.
/// </summary>
/// <param name="hStream">stream handle passed to the driver call</param>
public static void StreamSynchronize(CUstream hStream)
{
    var status = my.cuStreamSynchronize(hStream);
    testResult(status);
}
/// <summary>
/// Wrapper around <c>cuStreamQuery</c> (see CUDA doc). Unlike the other stream wrappers, the
/// status is not handed to <c>testResult</c> but returned to the caller unchanged.
/// </summary>
/// <param name="hStream">stream handle passed to the driver call</param>
/// <returns>the status reported by <c>cuStreamQuery</c></returns>
public static CUresult StreamQuery(CUstream hStream)
{
    var status = my.cuStreamQuery(hStream);
    return status;
}
/// <summary>
/// Wrapper around <c>cuStreamCreate</c> (see CUDA doc); the returned status is handed to
/// <c>testResult</c>.
/// </summary>
/// <param name="hStream">receives the stream handle produced by the driver call</param>
/// <param name="Flags">flags value forwarded to the driver call</param>
public static void StreamCreate(out CUstream hStream, uint Flags)
{
    var status = my.cuStreamCreate(out hStream, Flags);
    testResult(status);
}
/// <summary>
/// Wrapper around <c>cuLaunchGridAsync</c> (see CUDA doc); the returned status is handed to
/// <c>testResult</c>.
/// </summary>
/// <param name="f">kernel function handle</param>
/// <param name="grid_width">grid width forwarded to the driver call</param>
/// <param name="grid_height">grid height forwarded to the driver call</param>
/// <param name="hStream">stream handle forwarded to the driver call</param>
public static void LaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream)
{
    var status = my.cuLaunchGridAsync(f, grid_width, grid_height, hStream);
    testResult(status);
}
/// <summary>
/// Format-specific kernel invocation; to be implemented by derived classes.
/// NOTE(review): presumably implementations launch a sparse matrix-vector product on
/// <paramref name="stream"/> that combines <paramref name="alpha"/> times the product with
/// <paramref name="beta"/> times <paramref name="acc"/> - confirm against the kernels.
/// </summary>
/// <param name="stream">stream on which the implementation is expected to work</param>
/// <param name="alpha">scalar argument</param>
/// <param name="a">input vector</param>
/// <param name="beta">scalar argument</param>
/// <param name="acc">vector that presumably receives the result (TODO confirm)</param>
internal abstract void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc);