Beispiel #1
0
        public static float[] CRSSparseMMwithDenseVector(int repetition,
                                                         string moduleFunction, int blockSizeX, int blockSizeY)
        {
            CUDA cuda = new CUDA(0, true);

            // load module

            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;

            int maxIndex = 0;

            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);


            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr   = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr    = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            int outputSize = Rows * Cols;

            float[] output = new float[outputSize];

            //allocate memory for output
            IntPtr      outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput    = cuda.GetHostDevicePointer(outputPtr2, 0);

            //create dense vector for each column in B matrix
            float[] mainVec = new float[maxIndex + 1];

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            CUstream stream0 = cuda.CreateStream();


            IntPtr[] mainVecIntPtrs = new IntPtr[2];

            //write combined memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //
            //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);

            mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
            mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
            CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //mapped memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize);

            //get texture reference
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");

            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            int Aelements = AVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            int colIndexParamOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();


            int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
            int gridDim  = (Rows + blockSizeX - 1) / blockSizeX;



            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            for (int rep = 0; rep < repetition; rep++)
            {
                for (int k = 0; k < Cols; k++)
                {
                    Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                    cuda.SynchronizeStream(stream0);

                    cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                    cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
                    cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);

                    //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr);
                    ////make asynchronius copy and kernel lauch
                    //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0);
                    //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                    //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f);
                }
            }
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Marshal.Copy(outputPtr2, output, 0, outputSize);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);


            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);
            cuda.Free(dOutput);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            cuda.DestroyStream(stream0);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);


            return(output);
        }
Beispiel #2
0
        /// <summary>
        /// implementation of sparese matrix product
        /// </summary>
        /// <param name="repetition">how many times kernel should be launch</param>
        /// <param name="moduleFunction">cuda kenrel name</param>
        /// <param name="blockSizeX">block size X</param>
        /// <param name="blockSizeY">block size Y</param>
        /// <param name="transposeGrid">indicate that grid dimensions should be
        /// computed alternativly, if false than gridDimY- connected with rows
        /// else gridDim.Y conected with cols</param>
        /// <returns></returns>
        public static float[] CRSSparseMM(int repetition, string moduleFunction,
                                          int blockSizeX, int blockSizeY, bool transposeGrid)
        {
            //int blockSizeX = 4;
            //int blockSizeY = 4;

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;
            int   maxIndex = 0;

            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);


            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr   = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr    = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            CUdeviceptr BValsPtr   = cuda.CopyHostToDevice(BVals);
            CUdeviceptr BIdxPtr    = cuda.CopyHostToDevice(BIdx);
            CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen);

            int outputSize = Rows * Cols;

            float[] output = new float[outputSize];
            //CUdeviceptr dOutput = cuda.Allocate(output);

            IntPtr      outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput    = cuda.GetHostDevicePointer(outputPtr2, 0);


            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters


            int Aelements = AVals.Length;
            int Belements = BVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            cuda.SetParameter(cuFunc, offset, (uint)Aelements);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Belements);
            offset += sizeof(int);


            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();

            //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            //cuda.SetTextureFlags(cuTexRef, 0);

            int gridDimX = (int)Math.Ceiling((Cols + 0.0) / (blockSizeX));
            int gridDimY = (int)Math.Ceiling((0.0 + Rows) / blockSizeY);
            if (transposeGrid)
            {
                gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
                gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY);
            }

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);


            for (int k = 0; k < repetition; k++)
            {
                cuda.Launch(cuFunc, gridDimX, gridDimY);

                cuda.SynchronizeContext();
                //  cuda.CopyDeviceToHost(dOutput, output);
                Marshal.Copy(outputPtr2, output, 0, outputSize);
            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);


            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);

            cuda.Free(BValsPtr);
            cuda.Free(BIdxPtr);
            cuda.Free(BLenghtPtr);

            cuda.Free(dOutput);

            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return(output);
        }
        public override void Init()
        {
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();



            blockSize = threadsPerRow * sliceSize;
            int N = problemElements.Length;

            blockPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize);

            align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64;

            float[] vecVals;
            int[]   vecColIdx;
            int[]   vecLenght;
            int[]   sliceStart;

            CudaHelpers.TransformToSlicedEllpack(out vecVals, out vecColIdx, out sliceStart, out vecLenght, problemElements, threadsPerRow, sliceSize);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cudafy initialization

            InitCudaModule();

            //copy data to device, set cuda function parameters
            valsPtr       = gpu.CopyToDevice(vecVals);
            idxPtr        = gpu.CopyToDevice(vecColIdx);
            vecLenghtPtr  = gpu.CopyToDevice(vecLenght);
            sliceStartPtr = gpu.CopyToDevice(sliceStart);
            //!!!!!
            selfLinDotPtr = gpu.CopyToDevice(selfLinDot);
            labelsPtr     = gpu.CopyToDevice(Y);



            //gpu.CopyToConstantMemory(new float[] { Gamma }, GammaDev);

            //float[] GammaDev =new float[] { Gamma };
            //float[] GammaDevPtr = gpu.Allocate<float>(1);
            //gpu.CopyToConstantMemory<float>(GammaDev,GammaDevPtr);

            //float[] Gammas = new float[] { Gamma };
            //float[] GammaDev = gpu.Allocate<float>(1);
            //gpu.CopyToConstantMemory<float>(Gammas, GammaDev);


            int memSize = (problemElements.Length * sizeof(float));


            //allocate mapped memory for our results
            //outputIntPtr = gpu.HostAllocate<float>(problemElements.Length); // .HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            //outputPtr = gpu.GetDeviceMemoryFromIntPtr(outputIntPtr);// cuda.GetHostDevicePointer(outputIntPtr, 0);

            outputIntPtr = cuGPU.HostAllocate((uint)memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuGPU.GetHostDevicePointer(outputIntPtr, 0);

            #endregion



            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);



            CudaHelpers.SetTextureMemory(cuGPU, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVectorPtr);
            //CudaHelpers.SetTextureMemory(cuGPU, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
        }
Beispiel #4
0
        private void SetCudaData(Problem <SparseVec> sub_prob)
        {
            int vecDim = sub_prob.Elements[0].Dim;

            /*
             * copy vectors to CUDA device
             */
            float[] vecVals;
            int[]   vecIdx;
            int[]   vecLenght;
            CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSRPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSRPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);


            CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSCPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSCPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);



            /*
             * allocate memory for gradient
             */
            uint memSize = (uint)(sub_prob.ElementsCount * sizeof(float));

            //allocate mapped memory for our results (dot product beetween vector W and all elements)
            gradIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            gradPtr    = cuda.GetHostDevicePointer(gradIntPtr, 0);

            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[vecDim];


            //move W wector
            //CudaHelpers.FillDenseVector(problemElements[0], mainVector);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);


            //set texture memory for labels
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);


            /*
             * data for cuda solver
             */

            //normaly for L2 solver QDii= xii*xii+Diag_i
            //where Diag_i = 0.5/Cp if yi=1
            //      Diag_i = 0.5/Cn if yi=-1
            //but we will add this on GPU
            QD     = new float[sub_prob.ElementsCount];
            alpha  = new float[sub_prob.ElementsCount];
            deltas = new float[sub_prob.ElementsCount];
            float[] diag = new float[3];
            for (int i = 0; i < sub_prob.ElementsCount; i++)
            {
                QD[i]     = sub_prob.Elements[i].DotProduct();
                alpha[i]  = 0f;
                deltas[i] = 0;
            }

            qdPtr = cuda.CopyHostToDevice(QD);

            alphaPtr = cuda.Allocate(alpha);


            //deltasPtr = cuda.Allocate(deltas);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuDeltasTexRef, "deltasTexRef", deltas, ref deltasPtr);

            diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");
            //set this in fill function
            //cuda.CopyHostToDevice(diagPtr, diag);

            //CUdeviceptr dimPtr = cuda.GetModuleGlobal(cuModule, "Dim");
            ////todo: check if it ok
            ////cuda.Memset(dimPtr,(uint) vecDim, 1);
            //int[] dimArr = new int[] { vecDim };
            //cuda.CopyHostToDevice(dimPtr,dimArr);

            //CUDARuntime.cudaMemcpyToSymbol("Dim", dimPtr, 1, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);
            //CUDARuntime.cudaMemcpyToSymbol("Dim", ,1,0, cudaMemcpyKind.cudaMemcpyHostToDevice);

            CUdeviceptr deltaScalingPtr = cuda.GetModuleGlobal(cuModule, "stepScaling");

            //two ways of computing scaling param, should be the same, but it depends on rounding.
            //stepScaling = (float)(1.0 / Math.Sqrt(sub_prob.ElementsCount));

            stepScaling = 0.0002f;// (float)(1.0 / sub_prob.ElementsCount);

            //set scaling constant
            float[] scArr = new float[] { stepScaling };
            cuda.CopyHostToDevice(deltaScalingPtr, scArr);
            //cuda.Memset(deltaScalingPtr, (uint) scaling,sizeof(float));

            //cuda.CopyHostToDevice(dimPtr, problem.Elements[0].Dim);

            SetCudaParameters(sub_prob);
        }
        /// <summary>
        /// implementation of sparese matrix product
        /// </summary>
        /// <param name="repetition">how many times kernel should be launch</param>
        /// <param name="moduleFunction">cuda kenrel name</param>
        /// <param name="blockSizeX">block size X</param>
        /// <param name="blockSizeY">block size Y</param>
        /// <param name="transposeGrid">indicate that grid dimensions should be 
        /// computed alternativly, if false than gridDimY- connected with rows
        /// else gridDim.Y conected with cols</param>
        /// <returns></returns>
        public static float[] CRSSparseMM(int repetition, string moduleFunction, 
            int blockSizeX,int blockSizeY, bool transposeGrid)
        {
            //int blockSizeX = 4;
            //int blockSizeY = 4;

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;
            int maxIndex = 0;
            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen,out maxIndex);

               // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen,out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);

            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals);
            CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx);
            CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen);

            int outputSize = Rows * Cols;
            float[] output = new float[outputSize];
            //CUdeviceptr dOutput = cuda.Allocate(output);

            IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            int Aelements = AVals.Length;
            int Belements = BVals.Length;

            cuda.SetFunctionBlockShape(cuFunc,blockSizeX,blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            cuda.SetParameter(cuFunc, offset, (uint)Aelements);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Belements);
            offset += sizeof(int);

            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            //cuda.SetTextureFlags(cuTexRef, 0);

            int gridDimX =(int) Math.Ceiling((Cols + 0.0) / (blockSizeX));
            int gridDimY = (int)Math.Ceiling((0.0+Rows)/blockSizeY);
            if (transposeGrid)
            {
                gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
                gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY);
            }

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);

            for (int k = 0; k < repetition; k++)
            {
                cuda.Launch(cuFunc, gridDimX, gridDimY);

                cuda.SynchronizeContext();
               //  cuda.CopyDeviceToHost(dOutput, output);
                Marshal.Copy(outputPtr2, output, 0, outputSize);
            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Matrix products with kernel {0}",moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);

            cuda.Free(BValsPtr);
            cuda.Free(BIdxPtr);
            cuda.Free(BLenghtPtr);

            cuda.Free(dOutput);

            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return output;
        }
        public static float[] CRSSparseMMwithDenseVector(int repetition,
            string moduleFunction, int blockSizeX, int blockSizeY)
        {
            CUDA cuda = new CUDA(0, true);

            // load module

            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;

            int maxIndex = 0;
            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);

            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            int outputSize = Rows * Cols;
            float[] output = new float[outputSize];

            //allocate memory for output
            IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            //create dense vector for each column in B matrix
            float[] mainVec = new float[maxIndex + 1];

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            CUstream stream0 =cuda.CreateStream();

            IntPtr[] mainVecIntPtrs= new IntPtr[2];

            //write combined memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //
            //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);

            mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
            mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
            CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //mapped memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize);

            //get texture reference
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");
            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            int Aelements = AVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            int colIndexParamOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
            int gridDim= (Rows + blockSizeX - 1) / blockSizeX;

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            for (int rep = 0; rep < repetition; rep++)
            {
                for (int k = 0; k < Cols; k++)
                {

                    Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                    cuda.SynchronizeStream(stream0);

                    cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                    cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                    cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    Helpers.SetBufferIdx(BIdx, BRowLen, k-1, mainVecIntPtrs[(k+1) % 2], 0.0f);

                    //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr);
                    ////make asynchronius copy and kernel lauch
                    //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0);
                    //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                    //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f);
                }
            }
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Marshal.Copy(outputPtr2, output, 0, outputSize);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);
            cuda.Free(dOutput);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            cuda.DestroyStream(stream0);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);

            return output;
        }
Beispiel #7
0
        //private static void InitMainVector(float[] vecVals, int[] vecIdx, int[] vecLenght, float[] mainVec)
        //{
        //    for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
        //    {
        //        int idx = vecIdx[j];
        //        float val = vecVals[j];
        //        mainVec[idx] = val;
        //    }
        //}
        private static float[] CuDotProdCSRwriteCombined(int repetition)
        {
            //always the same values
            Random rnd = new Random(1);

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction("spmv_csr_vector_kernel_wc");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();

            //temp lists for values, indices and vecotr lenght
            List<float> vecValsL = new List<float>(N * maxRowSize / 2);
            List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
            List<int> vecLenghtL = new List<int>(N);

            float[] vecVals;
            int[] vecIdx;
            int[] vecLenght;

            maxIndex = 0;
            int vecStartIdx = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                vecValsL.AddRange(vals);

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                vecIdxL.AddRange(index);

                vecLenghtL.Add(vecStartIdx);
                vecStartIdx += vecSize;

            }
            //for last index
            vecLenghtL.Add(vecStartIdx);

            vecVals = vecValsL.ToArray();
            vecIdx = vecIdxL.ToArray();
            vecLenght = vecLenghtL.ToArray();

            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
            CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);

            float[] output = new float[N];
            //CUdeviceptr dOutput = cuda.Allocate(output);

            IntPtr outputPtr2 = cuda.HostAllocate((uint)(N * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));
            uint flags = CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP | CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED;
            uint tt = (uint)CUMemHostAllocFlags.WriteCombined;
            uint s = (uint)CUMemHostAllocFlags.DeviceMap;
            IntPtr mainVecIntPtr = cuda.HostAllocate(memSize, flags);

            CUdeviceptr mainVecPtr = cuda.GetHostDevicePointer(mainVecIntPtr, 0);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters
            cuda.SetFunctionBlockShape(cuFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, mainVecPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)vecStartIdx);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            mainIndex = StartingIndex;
            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);

            for (int k = 0; k < repetition; k++)
            {

                //float[] tempFloatarr = new float[memSize];
               Helpers.InitBuffer(vecVals, vecIdx, vecLenght,mainIndex, mainVecIntPtr);

                //Marshal.Copy(mainVecIntPtr, tempFloatarr, 0, tempFloatarr.Length);

                cuda.Launch(cuFunc, blocksPerGrid, 1);

                cuda.SynchronizeContext();
                //cuda.CopyDeviceToHost(dOutput, output);
                Marshal.Copy(outputPtr2, output, 0, N);

                //mainVec = new float[maxIndex + 1];
                //Array.Clear(mainVec, 0, mainVec.Length);

                //clear previous vector values

                Helpers.SetBufferIdx(vecIdx, vecLenght,mainIndex, mainVecIntPtr,0.0f);
                mainIndex++;

            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);

            // cuda.CopyDeviceToHost(dOutput, output);

            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.Free(vecLenghtPtr);
            //cuda.DestroyArray(cuArr);
            cuda.Free(mainVecPtr);
            //cuda.DestroyTexture(cuTexRef);

               // cuda.Free(mainVecPtr);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return output;
        }