예제 #1
0
        /// <summary>
        /// Fills the parameter block of the <c>sparseMultiply</c> kernel and enqueues its launch
        /// on <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">Stream passed to the asynchronous grid launch.</param>
        /// <param name="alpha">First double-precision kernel argument.</param>
        /// <param name="a">Vector whose device pointer becomes the third pointer argument.</param>
        /// <param name="beta">Second double-precision kernel argument.</param>
        /// <param name="acc">Vector whose device pointer becomes the fourth pointer argument.</param>
        internal override void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc)
        {
            CUdeviceptr inputPtr  = a.GetDevicePointer();
            CUdeviceptr outputPtr = acc.GetDevicePointer();

            // Width, in bytes, by which the parameter cursor advances for each argument kind.
            const int ptrBytes = sizeof(long);
            const int dblBytes = sizeof(double);
            const int intBytes = sizeof(uint);

            // Byte position of the next argument inside the kernel's parameter block.
            int cursor = 0;

            // --- pointer arguments ---
            cu.ParamSetp(sparseMultiply, cursor, d_val);
            cursor += ptrBytes;
            cu.ParamSetp(sparseMultiply, cursor, d_colIdx);
            cursor += ptrBytes;
            cu.ParamSetp(sparseMultiply, cursor, inputPtr);
            cursor += ptrBytes;
            cu.ParamSetp(sparseMultiply, cursor, outputPtr);
            cursor += ptrBytes;

            // --- scalar factors ---
            cu.ParamSetd(sparseMultiply, cursor, alpha);
            cursor += dblBytes;
            cu.ParamSetd(sparseMultiply, cursor, beta);
            cursor += dblBytes;

            // --- integer layout arguments ---
            cu.ParamSeti(sparseMultiply, cursor, size);
            cursor += intBytes;
            cu.ParamSeti(sparseMultiply, cursor, colCount);
            cursor += intBytes;
            cu.ParamSeti(sparseMultiply, cursor, valStride);
            cursor += intBytes;
            cu.ParamSeti(sparseMultiply, cursor, colStride);
            cursor += intBytes;

            // The final cursor value is the total size of the parameter block.
            cu.ParamSetSize(sparseMultiply, (uint)cursor);
            cu.FuncSetBlockShape(sparseMultiply, blocksize, 1, 1);

            // One-dimensional grid of 'blockcount' blocks, launched without waiting for completion.
            cu.LaunchGridAsync(sparseMultiply, blockcount, 1, stream);
        }
예제 #2
0
            /// <summary>
            /// Builds the send-side buffers for vector <paramref name="v"/>: records the per-rank send
            /// lengths, concatenates all per-rank index lists into one array that is copied to the
            /// device, and allocates one mapped host buffer that is partitioned into per-rank segments.
            /// </summary>
            /// <param name="M">Matrix whose <c>_SpmvCommPattern.ComLists</c> supplies the index list per rank.</param>
            /// <param name="v">Vector that becomes the <c>owner</c> of this object.</param>
            /// <param name="stream">Stream stored in <c>this.stream</c>.</param>
            internal CudaCommVector(MatrixBase M, CudaVector v, CUstream stream)
                : base(M, v)
            {
                this.owner  = v;
                this.stream = stream;
                cufill      = owner.m_env.Get_CudaVectorKernelDP_Function("fillSendBuffer");

                IDictionary <int, int[]> sendLists = M._SpmvCommPattern.ComLists;

                // Pass 1: store the list length for every rank and accumulate the overall total.
                int total = 0;
                foreach (int rank in sendLists.Keys)
                {
                    int count = sendLists[rank].Length;
                    base.SendBuffersLengths[rank] = count;
                    total += count;
                }

                size       = total;
                blockcount = (int)Math.Ceiling((decimal)size / blocksize);

                if (size <= 0)
                {
                    // Nothing to send: no host or device buffers are allocated.
                    return;
                }

                // Index array on host and device.
                h_IndicesToSend = new int[size];
                cu.MemAlloc(out d_IndicesToSend, (uint)size * sizeof(int));

                // Host buffer allocated with the DEVICEMAP flag, plus the device pointer that refers to it.
                cu.MemHostAlloc(out h_SendBuffer, sizeof(double) * (uint)size, CUmem_host_alloc.CU_MEMHOSTALLOC_DEVICEMAP);
                cu.MemHostGetDevicePointer(out d_SendBuffer, h_SendBuffer, 0);

                // Pass 2: walk the ranks again (same key enumeration as pass 1), handing each rank its
                // segment of the send buffer and appending its indices to the concatenated list.
                unsafe
                {
                    double *segment = (double *)h_SendBuffer;
                    int written     = 0;

                    foreach (int rank in sendLists.Keys)
                    {
                        // start address of the data sent to process 'rank'
                        base.SendBuffers[rank] = (IntPtr)segment;

                        int count = base.SendBuffersLengths[rank];
                        Array.Copy(sendLists[rank], 0, h_IndicesToSend, written, count);

                        segment += count;
                        written += count;
                    }
                }

                // Upload the concatenated index list.
                cu.MemcpyHtoD(d_IndicesToSend, h_IndicesToSend, (uint)size * sizeof(int));
            }
예제 #3
0
        /// <summary>
        /// Releases the resources held by this object: the host buffer <c>h_ElementsToAcc</c>,
        /// the device buffer <c>d_IndicesToAccumulate</c> and the two streams
        /// <c>intStream</c> and <c>extStream</c>. Calling it more than once is a no-op.
        /// </summary>
        public override void Dispose()
        {
            // Check the guard before doing anything else. Testing it only after base.Dispose()
            // re-runs the base disposal on every repeated call, and would skip the cleanup below
            // entirely if the flag is shared with (and set by) the base class.
            if (disposed)
            {
                return;
            }

            base.Dispose();

            cu.MemFreeHost(h_ElementsToAcc);
            h_ElementsToAcc = IntPtr.Zero;
            cu.MemFree(d_IndicesToAccumulate);

            cu.StreamDestroy(intStream);
            intStream = default(CUstream);
            cu.StreamDestroy(extStream);
            extStream = default(CUstream);

            disposed = true;
        }
예제 #4
0
        /// <summary>
        /// Fills the parameter block of the <c>sparseMultiply</c> kernel, sets its block shape and
        /// dynamic shared-memory size, and enqueues its launch on <paramref name="stream"/>.
        /// </summary>
        /// <param name="stream">Stream passed to the asynchronous grid launch.</param>
        /// <param name="alpha">First double-precision kernel argument.</param>
        /// <param name="a">Vector whose device pointer becomes the second pointer argument.</param>
        /// <param name="beta">Second double-precision kernel argument.</param>
        /// <param name="acc">Vector whose device pointer becomes the fourth pointer argument.</param>
        internal override void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc)
        {
            CUdeviceptr inputPtr  = a.GetDevicePointer();
            CUdeviceptr outputPtr = acc.GetDevicePointer();

            // Width, in bytes, by which the parameter cursor advances for each argument kind.
            const int ptrBytes = sizeof(long);
            const int dblBytes = sizeof(double);
            const int intBytes = sizeof(uint);

            // Byte position of the next argument inside the kernel's parameter block.
            int cursor = 0;

            // --- pointer arguments ---
            cu.ParamSetp(sparseMultiply, cursor, d_cellData);
            cursor += ptrBytes;
            cu.ParamSetp(sparseMultiply, cursor, inputPtr);
            cursor += ptrBytes;
            cu.ParamSetp(sparseMultiply, cursor, d_cellColIdx);
            cursor += ptrBytes;
            cu.ParamSetp(sparseMultiply, cursor, outputPtr);
            cursor += ptrBytes;

            // --- scalar factors ---
            cu.ParamSetd(sparseMultiply, cursor, alpha);
            cursor += dblBytes;
            cu.ParamSetd(sparseMultiply, cursor, beta);
            cursor += dblBytes;

            // --- integer layout arguments ---
            cu.ParamSeti(sparseMultiply, cursor, cellsize);
            cursor += intBytes;
            cu.ParamSeti(sparseMultiply, cursor, cellrowsperblock);
            cursor += intBytes;
            cu.ParamSeti(sparseMultiply, cursor, cellsperrow);
            cursor += intBytes;
            cu.ParamSeti(sparseMultiply, cursor, stride);
            cursor += intBytes;
            cu.ParamSeti(sparseMultiply, cursor, rowcount);
            cursor += intBytes;

            // The final cursor value is the total size of the parameter block.
            cu.ParamSetSize(sparseMultiply, (uint)cursor);
            cu.FuncSetBlockShape(sparseMultiply, blocksize, 1, 1);

            // Shared memory request: 'blocksize' doubles plus 2 * 'cellrowsperblock' ints.
            var sharedBytes = blocksize * sizeof(double) + 2 * cellrowsperblock * sizeof(int);
            cu.FuncSetSharedSize(sparseMultiply, (uint)sharedBytes);

            // One-dimensional grid of 'blockcount' blocks, launched without waiting for completion.
            cu.LaunchGridAsync(sparseMultiply, blockcount, 1, stream);
        }
예제 #5
0
 /// <summary>
 /// Wrapper for the driver call <c>cuStreamDestroy</c>; the status it returns is handed to
 /// <c>testResult</c> (presumably raising on a non-success code; confirm against its definition).
 /// </summary>
 /// <param name="hStream">Handle of the stream to destroy.</param>
 public static void StreamDestroy(CUstream hStream)
 {
     var status = my.cuStreamDestroy(hStream);
     testResult(status);
 }
예제 #6
0
 /// <summary>
 /// Wrapper for the driver call <c>cuStreamSynchronize</c>; the status it returns is handed to
 /// <c>testResult</c> (presumably raising on a non-success code; confirm against its definition).
 /// </summary>
 /// <param name="hStream">Handle of the stream to synchronize with.</param>
 public static void StreamSynchronize(CUstream hStream)
 {
     var status = my.cuStreamSynchronize(hStream);
     testResult(status);
 }
예제 #7
0
 /// <summary>
 /// Wrapper for the driver call <c>cuStreamQuery</c>. Unlike the sibling stream wrappers, the
 /// status is not passed through <c>testResult</c> but returned to the caller unchanged.
 /// </summary>
 /// <param name="hStream">Handle of the stream to query.</param>
 /// <returns>The status code produced by <c>cuStreamQuery</c>.</returns>
 public static CUresult StreamQuery(CUstream hStream)
 {
     var status = my.cuStreamQuery(hStream);
     return status;
 }
예제 #8
0
 /// <summary>
 /// Wrapper for the driver call <c>cuStreamCreate</c>; the status it returns is handed to
 /// <c>testResult</c> (presumably raising on a non-success code; confirm against its definition).
 /// </summary>
 /// <param name="hStream">Receives the handle of the newly created stream.</param>
 /// <param name="Flags">Creation flags forwarded unchanged to the driver.</param>
 public static void StreamCreate(out CUstream hStream, uint Flags)
 {
     var status = my.cuStreamCreate(out hStream, Flags);
     testResult(status);
 }
예제 #9
0
 /// <summary>
 /// Wrapper for the driver call <c>cuLaunchGridAsync</c>; the status it returns is handed to
 /// <c>testResult</c> (presumably raising on a non-success code; confirm against its definition).
 /// </summary>
 /// <param name="f">Kernel function to launch.</param>
 /// <param name="grid_width">Grid width forwarded to the driver.</param>
 /// <param name="grid_height">Grid height forwarded to the driver.</param>
 /// <param name="hStream">Stream on which the launch is enqueued.</param>
 public static void LaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream)
 {
     var status = my.cuLaunchGridAsync(f, grid_width, grid_height, hStream);
     testResult(status);
 }
예제 #10
0
 /// <summary>
 /// Format-specific kernel launch, to be supplied by each concrete matrix class.
 /// NOTE(review): presumably computes acc = alpha * (matrix * a) + beta * acc on the device;
 /// the kernel source is not visible from this declaration, so confirm.
 /// </summary>
 /// <param name="stream">Stream the implementation is given for its launch.</param>
 /// <param name="alpha">Double-precision scalar handed to the implementation.</param>
 /// <param name="a">Vector argument (presumably the input operand; confirm).</param>
 /// <param name="beta">Double-precision scalar handed to the implementation.</param>
 /// <param name="acc">Vector argument (presumably the accumulator / result; confirm).</param>
 abstract internal void CallDriver(CUstream stream, double alpha, CudaVector a, double beta, CudaVector acc);