/// <summary>
/// Binds the arguments of the kernel referenced by <c>sparseMultiply</c> and
/// enqueues it on <paramref name="stream"/> without waiting for completion.
/// </summary>
/// <remarks>
/// Argument layout, in order: <c>d_val</c>, <c>d_colIdx</c>, the device pointer of
/// <paramref name="a"/>, the device pointer of <paramref name="acc"/>,
/// <paramref name="alpha"/>, <paramref name="beta"/>, then the integers
/// <c>size</c>, <c>colCount</c>, <c>valStride</c>, <c>colStride</c>.
/// </remarks>
/// <param name="stream">stream the kernel launch is issued on</param>
/// <param name="alpha">scalar passed to the kernel as fifth argument</param>
/// <param name="a">vector whose device pointer is passed as third argument</param>
/// <param name="beta">scalar passed to the kernel as sixth argument</param>
/// <param name="acc">vector whose device pointer is passed as fourth argument</param>
internal override void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc) {
    // Byte widths used to advance through the kernel argument buffer.
    const int PointerBytes = sizeof(long);
    const int ScalarBytes = sizeof(double);
    const int IntBytes = sizeof(uint);

    CUdeviceptr inputPtr = a.GetDevicePointer();
    CUdeviceptr outputPtr = acc.GetDevicePointer();

    // Running byte offset of the next kernel argument.
    int pos = 0;

    // --- device pointers ---
    cu.ParamSetp(sparseMultiply, pos, d_val);
    pos += PointerBytes;
    cu.ParamSetp(sparseMultiply, pos, d_colIdx);
    pos += PointerBytes;
    cu.ParamSetp(sparseMultiply, pos, inputPtr);
    pos += PointerBytes;
    cu.ParamSetp(sparseMultiply, pos, outputPtr);
    pos += PointerBytes;

    // --- scaling factors ---
    cu.ParamSetd(sparseMultiply, pos, alpha);
    pos += ScalarBytes;
    cu.ParamSetd(sparseMultiply, pos, beta);
    pos += ScalarBytes;

    // --- integer arguments ---
    cu.ParamSeti(sparseMultiply, pos, size);
    pos += IntBytes;
    cu.ParamSeti(sparseMultiply, pos, colCount);
    pos += IntBytes;
    cu.ParamSeti(sparseMultiply, pos, valStride);
    pos += IntBytes;
    cu.ParamSeti(sparseMultiply, pos, colStride);
    pos += IntBytes;

    // The final offset is the total size of the argument buffer.
    cu.ParamSetSize(sparseMultiply, (uint)pos);

    cu.FuncSetBlockShape(sparseMultiply, blocksize, 1, 1);
    cu.LaunchGridAsync(sparseMultiply, blockcount, 1, stream);
}
/// <summary>
/// In-place element-wise product: for every index <em>j</em>,
/// this[j] = this[j] * <paramref name="other"/>[j].
/// The work is done by the kernel referenced by <c>cumew</c>.
/// </summary>
/// <param name="other">
/// second factor; must be a locked <c>CudaVector</c> with the same local length as this vector
/// </param>
/// <exception cref="ApplicationException">this vector or <paramref name="other"/> is not locked</exception>
/// <exception cref="ArgumentException">
/// <paramref name="other"/> is not a <c>CudaVector</c>, or the local lengths differ
/// </exception>
public override void MultiplyElementWise(VectorBase other) {
    if (!(this.IsLocked && other.IsLocked)) {
        throw new ApplicationException("works only in locked mode");
    }

    CudaVector factor = other as CudaVector;
    if (factor == null) {
        throw new ArgumentException("other must be of type CudaVector.", "other");
    }

    if (this.Part.LocalLength != factor.Part.LocalLength) {
        throw new ArgumentException("mismatch in vector size.");
    }

    // Kernel arguments: (this data, other data, element count).
    int pos = 0;

    cu.ParamSetp(cumew, pos, d_data);
    pos += sizeof(long);

    cu.ParamSetp(cumew, pos, factor.GetDevicePointer());
    pos += sizeof(long);

    cu.ParamSeti(cumew, pos, size);
    pos += sizeof(uint);

    cu.ParamSetSize(cumew, (uint)pos);

    cu.FuncSetBlockShape(cumew, blocksize, 1, 1);
    cu.LaunchGrid(cumew, blockcountfull, 1);
}
/// <summary>
/// Inner product of this vector with <paramref name="other"/>, summed over all
/// MPI processes in the world communicator.
/// </summary>
/// <remarks>
/// The kernel referenced by <c>cuinnerprod</c> is launched with
/// <c>blockcounthalf</c> blocks; after a context synchronization,
/// <c>blockcounthalf</c> doubles are read from <c>h_result</c> and added up on the
/// host, and the local sum is then combined with an MPI all-reduce (SUM).
/// NOTE(review): presumably <c>h_result</c> is the host-side view of <c>d_result</c>
/// holding one partial sum per block - confirm against the allocation code.
/// </remarks>
/// <param name="other">
/// second operand; must be a locked <c>CudaVector</c> with the same local length as this vector
/// </param>
/// <returns>the globally reduced inner product</returns>
/// <exception cref="ApplicationException">this vector or <paramref name="other"/> is not locked</exception>
/// <exception cref="ArgumentException">
/// <paramref name="other"/> is not a <c>CudaVector</c>, or the local lengths differ
/// </exception>
public override double InnerProd(VectorBase other) {
    if (!(this.IsLocked && other.IsLocked)) {
        throw new ApplicationException("works only in locked mode");
    }

    CudaVector rhs = other as CudaVector;
    if (rhs == null) {
        throw new ArgumentException("other must be of type CudaVector.", "other");
    }

    if (this.Part.LocalLength != rhs.Part.LocalLength) {
        throw new ArgumentException("mismatch in vector size.");
    }

    int pos = 0;
    double localSum = 0.0;

    // Kernel arguments: (this data, other data, result buffer, element count).
    cu.ParamSetp(cuinnerprod, pos, d_data);
    pos += sizeof(long);

    cu.ParamSetp(cuinnerprod, pos, rhs.GetDevicePointer());
    pos += sizeof(long);

    cu.ParamSetp(cuinnerprod, pos, d_result);
    pos += sizeof(long);

    cu.ParamSeti(cuinnerprod, pos, size);
    pos += sizeof(uint);

    cu.ParamSetSize(cuinnerprod, (uint)pos);

    cu.FuncSetBlockShape(cuinnerprod, blocksize, 1, 1);
    // One double of dynamic shared memory per thread of a block.
    cu.FuncSetSharedSize(cuinnerprod, (uint)(blocksize * sizeof(double)));
    cu.LaunchGrid(cuinnerprod, blockcounthalf, 1);

    // Wait for the kernel before the result buffer is read on the host.
    cu.CtxSynchronize();

    double globalSum = double.NaN;
    unsafe {
        // Add up the per-block values in ascending order.
        double* partials = (double*)h_result;
        int block = 0;
        while (block < blockcounthalf) {
            localSum += partials[block];
            block++;
        }

        // Combine the local sums of all processes.
        csMPI.Raw.Allreduce(
            (IntPtr)(&localSum),
            (IntPtr)(&globalSum),
            1,
            csMPI.Raw._DATATYPE.DOUBLE,
            csMPI.Raw._OP.SUM,
            csMPI.Raw._COMM.WORLD);
    }

    return globalSum;
}
/// <summary>
/// Sets up the send side of the communication vector for matrix
/// <paramref name="M"/> and vector <paramref name="v"/>.
/// </summary>
/// <remarks>
/// The per-process index lists from <c>M._SpmvCommPattern.ComLists</c> are
/// concatenated into one index array that is uploaded to the device. One host
/// buffer, allocated with the device-map flag, is carved into consecutive
/// per-process send buffers in the same key order. Nothing is allocated when the
/// lists contain no entries.
/// </remarks>
/// <param name="M">matrix providing the communication pattern</param>
/// <param name="v">owning vector; also supplies the CUDA environment for the kernel lookup</param>
/// <param name="stream">stream stored for later use by this object</param>
internal CudaCommVector(MatrixBase M, CudaVector v, CUstream stream)
    : base(M, v) {
    this.owner = v;
    this.stream = stream;
    cufill = owner.m_env.Get_CudaVectorKernelDP_Function("fillSendBuffer");

    IDictionary<int, int[]> sendLists = M._SpmvCommPattern.ComLists;

    // Pass 1: record the buffer length per target process and the grand total.
    int entryTotal = 0;
    foreach (int rank in sendLists.Keys) {
        int count = sendLists[rank].Length;
        base.SendBuffersLengths[rank] = count;
        entryTotal += count;
    }

    size = entryTotal;
    blockcount = (int)Math.Ceiling((decimal)size / blocksize);

    if (!(size > 0)) {
        // nothing to send: no host or device storage required
        return;
    }

    // Storage: host index list, its device copy, and the device-mapped host send buffer.
    h_IndicesToSend = new int[size];
    cu.MemAlloc(out d_IndicesToSend, (uint)size * sizeof(int));
    cu.MemHostAlloc(out h_SendBuffer, sizeof(double) * (uint)size, CUmem_host_alloc.CU_MEMHOSTALLOC_DEVICEMAP);
    cu.MemHostGetDevicePointer(out d_SendBuffer, h_SendBuffer, 0);

    // Pass 2: hand out consecutive slices of the send buffer and concatenate the index lists.
    unsafe {
        double* slice = (double*)h_SendBuffer;
        int writePos = 0;

        foreach (int rank in sendLists.Keys) {
            // start address of the data sent to process 'rank'
            base.SendBuffers[rank] = (IntPtr)slice;

            int count = base.SendBuffersLengths[rank];
            Array.Copy(sendLists[rank], 0, h_IndicesToSend, writePos, count);

            slice += count;
            writePos += count;
        }
    }

    cu.MemcpyHtoD(d_IndicesToSend, h_IndicesToSend, (uint)size * sizeof(int));
}
/// <summary>
/// Prepares the external part of a sparse matrix-vector product: stores
/// <paramref name="alpha"/> and the device pointer of <paramref name="acc"/>, and
/// clears the first <c>extSize</c> doubles of the host buffer <c>h_ElementsToAcc</c>.
/// </summary>
/// <param name="alpha">scaling factor, stored in <c>m_alpha</c></param>
/// <param name="beta">not used by this method</param>
/// <param name="acc">accumulator vector; cast to <c>CudaVector</c> without a type check</param>
internal override void SpMV_External_Begin(double alpha, double beta, VectorBase acc) {
    m_alpha = alpha;

    CudaVector target = (CudaVector)acc;
    d_acc = target.GetDevicePointer();

    unsafe {
        double* cursor = (double*)h_ElementsToAcc;

        // Count down while the write pointer walks forward through the buffer.
        int remaining = (int)extSize - 1;
        while (remaining >= 0) {
            *cursor++ = 0;
            remaining--;
        }
    }
}
/// <summary>
/// Overwrites the device data of this vector with the device data of
/// <paramref name="other"/> using a device-to-device copy of <c>size</c> doubles.
/// </summary>
/// <param name="other">
/// source vector; must be a locked <c>CudaVector</c> with the same local length as this vector
/// </param>
/// <exception cref="ApplicationException">this vector or <paramref name="other"/> is not locked</exception>
/// <exception cref="ArgumentException">
/// <paramref name="other"/> is not a <c>CudaVector</c>, or the local lengths differ
/// </exception>
public override void CopyFrom(VectorBase other) {
    if (!(this.IsLocked && other.IsLocked)) {
        throw new ApplicationException("works only in locked mode");
    }

    CudaVector source = other as CudaVector;
    if (source == null) {
        throw new ArgumentException("other must be of type CudaVector.", "other");
    }

    if (this.Part.LocalLength != source.Part.LocalLength) {
        throw new ArgumentException("mismatch in vector size.");
    }

    // destination first, then source, then the byte count
    cu.MemcpyDtoD(
        d_data,
        source.GetDevicePointer(),
        (uint)(size * sizeof(double)));
}
/// <summary>
/// Validates the operands and forwards them to <c>CallDriver</c> on the stream
/// <c>intStream</c>.
/// </summary>
/// <param name="alpha">scalar forwarded to <c>CallDriver</c></param>
/// <param name="a">input vector; must be a <c>CudaVector</c></param>
/// <param name="beta">scalar forwarded to <c>CallDriver</c></param>
/// <param name="acc">accumulator vector; must be a <c>CudaVector</c></param>
/// <exception cref="ApplicationException">this object is not locked</exception>
/// <exception cref="ArgumentException">
/// <paramref name="a"/> or <paramref name="acc"/> is not a <c>CudaVector</c>
/// </exception>
internal override void SpMV_Local_Start(double alpha, VectorBase a, double beta, VectorBase acc) {
    if (!m_IsLocked) {
        throw new ApplicationException("object must be locked.");
    }

    // 'a' is validated before 'acc', so a bad 'a' is always reported first.
    CudaVector input = a as CudaVector;
    if (input == null) {
        throw new ArgumentException("a must be of type CudaVector.", "a");
    }

    CudaVector accumulator = acc as CudaVector;
    if (accumulator == null) {
        throw new ArgumentException("acc must be of type CudaVector.", "acc");
    }

    CallDriver(intStream, alpha, input, beta, accumulator);
}
/// <summary>
/// Binds the arguments of the kernel referenced by <c>sparseMultiply</c>, sets its
/// block shape and dynamic shared memory size, and enqueues it on
/// <paramref name="stream"/> without waiting for completion.
/// </summary>
/// <remarks>
/// Argument layout, in order: <c>d_cellData</c>, the device pointer of
/// <paramref name="a"/>, <c>d_cellColIdx</c>, the device pointer of
/// <paramref name="acc"/>, <paramref name="alpha"/>, <paramref name="beta"/>, then
/// the integers <c>cellsize</c>, <c>cellrowsperblock</c>, <c>cellsperrow</c>,
/// <c>stride</c>, <c>rowcount</c>.
/// Shared memory requested: one double per thread of a block plus two ints per
/// cell row of a block.
/// </remarks>
/// <param name="stream">stream the kernel launch is issued on</param>
/// <param name="alpha">scalar passed to the kernel as fifth argument</param>
/// <param name="a">vector whose device pointer is passed as second argument</param>
/// <param name="beta">scalar passed to the kernel as sixth argument</param>
/// <param name="acc">vector whose device pointer is passed as fourth argument</param>
internal override void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc) {
    // Byte widths used to advance through the kernel argument buffer.
    const int PointerBytes = sizeof(long);
    const int ScalarBytes = sizeof(double);
    const int IntBytes = sizeof(uint);

    CUdeviceptr inputPtr = a.GetDevicePointer();
    CUdeviceptr outputPtr = acc.GetDevicePointer();

    // Running byte offset of the next kernel argument.
    int pos = 0;

    // --- device pointers ---
    cu.ParamSetp(sparseMultiply, pos, d_cellData);
    pos += PointerBytes;
    cu.ParamSetp(sparseMultiply, pos, inputPtr);
    pos += PointerBytes;
    cu.ParamSetp(sparseMultiply, pos, d_cellColIdx);
    pos += PointerBytes;
    cu.ParamSetp(sparseMultiply, pos, outputPtr);
    pos += PointerBytes;

    // --- scaling factors ---
    cu.ParamSetd(sparseMultiply, pos, alpha);
    pos += ScalarBytes;
    cu.ParamSetd(sparseMultiply, pos, beta);
    pos += ScalarBytes;

    // --- integer arguments ---
    cu.ParamSeti(sparseMultiply, pos, cellsize);
    pos += IntBytes;
    cu.ParamSeti(sparseMultiply, pos, cellrowsperblock);
    pos += IntBytes;
    cu.ParamSeti(sparseMultiply, pos, cellsperrow);
    pos += IntBytes;
    cu.ParamSeti(sparseMultiply, pos, stride);
    pos += IntBytes;
    cu.ParamSeti(sparseMultiply, pos, rowcount);
    pos += IntBytes;

    // The final offset is the total size of the argument buffer.
    cu.ParamSetSize(sparseMultiply, (uint)pos);

    cu.FuncSetBlockShape(sparseMultiply, blocksize, 1, 1);
    cu.FuncSetSharedSize(
        sparseMultiply,
        (uint)(blocksize * sizeof(double) + 2 * cellrowsperblock * sizeof(int)));
    cu.LaunchGridAsync(sparseMultiply, blockcount, 1, stream);
}
/// <summary>
/// Exchanges the contents of this vector and <paramref name="other"/> by swapping
/// their <c>d_data</c> device pointers; no element data is copied.
/// </summary>
/// <param name="other">
/// swap partner; must be a locked <c>CudaVector</c> with the same local length as this vector
/// </param>
/// <exception cref="ApplicationException">this vector or <paramref name="other"/> is not locked</exception>
/// <exception cref="ArgumentException">
/// <paramref name="other"/> is not a <c>CudaVector</c>, or the local lengths differ
/// </exception>
public override void Swap(VectorBase other) {
    if (!(this.IsLocked && other.IsLocked)) {
        throw new ApplicationException("works only in locked mode");
    }

    CudaVector partner = other as CudaVector;
    if (partner == null) {
        throw new ArgumentException("other must be of type CudaVector.", "other");
    }

    if (this.Part.LocalLength != partner.Part.LocalLength) {
        throw new ArgumentException("mismatch in vector size.");
    }

    // Only the pointers change hands.
    CUdeviceptr partnerData = partner.d_data;
    partner.d_data = this.d_data;
    this.d_data = partnerData;
}
/// <summary>
/// Launch hook that every concrete derived class must implement.
/// The overrides visible in this file bind their matrix storage, the device
/// pointers of <paramref name="a"/> and <paramref name="acc"/>, and the two scalars
/// as arguments of the <c>sparseMultiply</c> kernel and enqueue that kernel
/// asynchronously on <paramref name="stream"/>.
/// </summary>
/// <param name="stream">stream the kernel launch is issued on</param>
/// <param name="alpha">
/// scalar forwarded to the kernel;
/// NOTE(review): presumably the factor of the matrix-vector product - confirm against the kernel source
/// </param>
/// <param name="a">input vector; its device pointer is handed to the kernel</param>
/// <param name="beta">
/// scalar forwarded to the kernel;
/// NOTE(review): presumably the factor applied to the previous content of <paramref name="acc"/> - confirm against the kernel source
/// </param>
/// <param name="acc">accumulator vector; its device pointer is handed to the kernel</param>
abstract internal void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc);