C# (CSharp) CUDA.HostAllocate примеры использования

Язык программирования: C# (CSharp)

Класс/Тип: CUDA

Метод/Функция: HostAllocate

Примеров на hotexamples.com: 7

C# (CSharp) CUDA.HostAllocate - 7 примеров найдено. Это лучшие примеры C# (CSharp) кода для CUDA.HostAllocate, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

CopyHostToDevice(19)

LoadModule(18)

Free(17)

CopyDeviceToHost(14)

Launch(14)

Allocate(11)

GetModuleFunction(8)

DestroyTexture(5)

GetModuleTexture(5)

CreateContext(5)

CreateEvent(5)

ElapsedTime(5)

HostAllocate(4)

GetModuleGlobal(4)

GetHostDevicePointer(4)

DestroyEvent(4)

Dispose(3)

AllocateHost(3)

CreateStream(3)

CopyHostToDeviceAsync(3)

LaunchAsync(2)

CopyHostToArray(2)

DestroyStream(2)

CreateArray(2)

DestroyArray(2)

FreeHost(2)

DestroyContext(2)

SetTextureArray(2)

GetDeviceCount(1)

GetDeviceAttribute(1)

Init(1)

CopyDeviceToHostAsync(1)

CopyDeviceToDevice(1)

Log(1)

Пример #1

Показать файл

        public static float[] CRSSparseMMwithDenseVector(int repetition,
                                                         string moduleFunction, int blockSizeX, int blockSizeY)
        {
            CUDA cuda = new CUDA(0, true);

            // load module

            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;

            int maxIndex = 0;

            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);


            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr   = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr    = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            int outputSize = Rows * Cols;

            float[] output = new float[outputSize];

            //allocate memory for output
            IntPtr      outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput    = cuda.GetHostDevicePointer(outputPtr2, 0);

            //create dense vector for each column in B matrix
            float[] mainVec = new float[maxIndex + 1];

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            CUstream stream0 = cuda.CreateStream();


            IntPtr[] mainVecIntPtrs = new IntPtr[2];

            //write combined memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //
            //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);

            mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
            mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
            CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //mapped memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize);

            //get texture reference
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");

            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            int Aelements = AVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            int colIndexParamOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();


            int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
            int gridDim  = (Rows + blockSizeX - 1) / blockSizeX;



            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            for (int rep = 0; rep < repetition; rep++)
            {
                for (int k = 0; k < Cols; k++)
                {
                    Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                    cuda.SynchronizeStream(stream0);

                    cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                    cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
                    cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);

                    //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr);
                    ////make asynchronius copy and kernel lauch
                    //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0);
                    //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                    //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f);
                }
            }
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Marshal.Copy(outputPtr2, output, 0, outputSize);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);


            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);
            cuda.Free(dOutput);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            cuda.DestroyStream(stream0);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);


            return(output);
        }

Пример #2

Показать файл

        /// <summary>
        /// implementation of sparese matrix product
        /// </summary>
        /// <param name="repetition">how many times kernel should be launch</param>
        /// <param name="moduleFunction">cuda kenrel name</param>
        /// <param name="blockSizeX">block size X</param>
        /// <param name="blockSizeY">block size Y</param>
        /// <param name="transposeGrid">indicate that grid dimensions should be
        /// computed alternativly, if false than gridDimY- connected with rows
        /// else gridDim.Y conected with cols</param>
        /// <returns></returns>
        public static float[] CRSSparseMM(int repetition, string moduleFunction,
                                          int blockSizeX, int blockSizeY, bool transposeGrid)
        {
            //int blockSizeX = 4;
            //int blockSizeY = 4;

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;
            int   maxIndex = 0;

            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);


            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr   = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr    = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            CUdeviceptr BValsPtr   = cuda.CopyHostToDevice(BVals);
            CUdeviceptr BIdxPtr    = cuda.CopyHostToDevice(BIdx);
            CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen);

            int outputSize = Rows * Cols;

            float[] output = new float[outputSize];
            //CUdeviceptr dOutput = cuda.Allocate(output);

            IntPtr      outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput    = cuda.GetHostDevicePointer(outputPtr2, 0);


            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters


            int Aelements = AVals.Length;
            int Belements = BVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            cuda.SetParameter(cuFunc, offset, (uint)Aelements);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Belements);
            offset += sizeof(int);


            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();

            //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            //cuda.SetTextureFlags(cuTexRef, 0);

            int gridDimX = (int)Math.Ceiling((Cols + 0.0) / (blockSizeX));
            int gridDimY = (int)Math.Ceiling((0.0 + Rows) / blockSizeY);
            if (transposeGrid)
            {
                gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
                gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY);
            }

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);


            for (int k = 0; k < repetition; k++)
            {
                cuda.Launch(cuFunc, gridDimX, gridDimY);

                cuda.SynchronizeContext();
                //  cuda.CopyDeviceToHost(dOutput, output);
                Marshal.Copy(outputPtr2, output, 0, outputSize);
            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);


            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);

            cuda.Free(BValsPtr);
            cuda.Free(BIdxPtr);
            cuda.Free(BLenghtPtr);

            cuda.Free(dOutput);

            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return(output);
        }

Пример #3

Показать файл

Файл: CudafyRBFSlicedEllpackKernel.cs Проект: endeffects/KMLib

        public override void Init()
        {
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();



            blockSize = threadsPerRow * sliceSize;
            int N = problemElements.Length;

            blockPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize);

            align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64;

            float[] vecVals;
            int[]   vecColIdx;
            int[]   vecLenght;
            int[]   sliceStart;

            CudaHelpers.TransformToSlicedEllpack(out vecVals, out vecColIdx, out sliceStart, out vecLenght, problemElements, threadsPerRow, sliceSize);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cudafy initialization

            InitCudaModule();

            //copy data to device, set cuda function parameters
            valsPtr       = gpu.CopyToDevice(vecVals);
            idxPtr        = gpu.CopyToDevice(vecColIdx);
            vecLenghtPtr  = gpu.CopyToDevice(vecLenght);
            sliceStartPtr = gpu.CopyToDevice(sliceStart);
            //!!!!!
            selfLinDotPtr = gpu.CopyToDevice(selfLinDot);
            labelsPtr     = gpu.CopyToDevice(Y);



            //gpu.CopyToConstantMemory(new float[] { Gamma }, GammaDev);

            //float[] GammaDev =new float[] { Gamma };
            //float[] GammaDevPtr = gpu.Allocate<float>(1);
            //gpu.CopyToConstantMemory<float>(GammaDev,GammaDevPtr);

            //float[] Gammas = new float[] { Gamma };
            //float[] GammaDev = gpu.Allocate<float>(1);
            //gpu.CopyToConstantMemory<float>(Gammas, GammaDev);


            int memSize = (problemElements.Length * sizeof(float));


            //allocate mapped memory for our results
            //outputIntPtr = gpu.HostAllocate<float>(problemElements.Length); // .HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            //outputPtr = gpu.GetDeviceMemoryFromIntPtr(outputIntPtr);// cuda.GetHostDevicePointer(outputIntPtr, 0);

            outputIntPtr = cuGPU.HostAllocate((uint)memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuGPU.GetHostDevicePointer(outputIntPtr, 0);

            #endregion



            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);



            CudaHelpers.SetTextureMemory(cuGPU, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVectorPtr);
            //CudaHelpers.SetTextureMemory(cuGPU, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
        }

Пример #4

Показать файл

Файл: CUDALinSolver.cs Проект: endeffects/KMLib

        private void SetCudaData(Problem <SparseVec> sub_prob)
        {
            int vecDim = sub_prob.Elements[0].Dim;

            /*
             * copy vectors to CUDA device
             */
            float[] vecVals;
            int[]   vecIdx;
            int[]   vecLenght;
            CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSRPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSRPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght);


            CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements);
            valsCSCPtr      = cuda.CopyHostToDevice(vecVals);
            idxCSCPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght);



            /*
             * allocate memory for gradient
             */
            uint memSize = (uint)(sub_prob.ElementsCount * sizeof(float));

            //allocate mapped memory for our results (dot product beetween vector W and all elements)
            gradIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            gradPtr    = cuda.GetHostDevicePointer(gradIntPtr, 0);

            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[vecDim];


            //move W wector
            //CudaHelpers.FillDenseVector(problemElements[0], mainVector);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);


            //set texture memory for labels
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);


            /*
             * data for cuda solver
             */

            //normaly for L2 solver QDii= xii*xii+Diag_i
            //where Diag_i = 0.5/Cp if yi=1
            //      Diag_i = 0.5/Cn if yi=-1
            //but we will add this on GPU
            QD     = new float[sub_prob.ElementsCount];
            alpha  = new float[sub_prob.ElementsCount];
            deltas = new float[sub_prob.ElementsCount];
            float[] diag = new float[3];
            for (int i = 0; i < sub_prob.ElementsCount; i++)
            {
                QD[i]     = sub_prob.Elements[i].DotProduct();
                alpha[i]  = 0f;
                deltas[i] = 0;
            }

            qdPtr = cuda.CopyHostToDevice(QD);

            alphaPtr = cuda.Allocate(alpha);


            //deltasPtr = cuda.Allocate(deltas);
            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuDeltasTexRef, "deltasTexRef", deltas, ref deltasPtr);

            diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");
            //set this in fill function
            //cuda.CopyHostToDevice(diagPtr, diag);

            //CUdeviceptr dimPtr = cuda.GetModuleGlobal(cuModule, "Dim");
            ////todo: check if it ok
            ////cuda.Memset(dimPtr,(uint) vecDim, 1);
            //int[] dimArr = new int[] { vecDim };
            //cuda.CopyHostToDevice(dimPtr,dimArr);

            //CUDARuntime.cudaMemcpyToSymbol("Dim", dimPtr, 1, 0, cudaMemcpyKind.cudaMemcpyHostToDevice);
            //CUDARuntime.cudaMemcpyToSymbol("Dim", ,1,0, cudaMemcpyKind.cudaMemcpyHostToDevice);

            CUdeviceptr deltaScalingPtr = cuda.GetModuleGlobal(cuModule, "stepScaling");

            //two ways of computing scaling param, should be the same, but it depends on rounding.
            //stepScaling = (float)(1.0 / Math.Sqrt(sub_prob.ElementsCount));

            stepScaling = 0.0002f;// (float)(1.0 / sub_prob.ElementsCount);

            //set scaling constant
            float[] scArr = new float[] { stepScaling };
            cuda.CopyHostToDevice(deltaScalingPtr, scArr);
            //cuda.Memset(deltaScalingPtr, (uint) scaling,sizeof(float));

            //cuda.CopyHostToDevice(dimPtr, problem.Elements[0].Dim);

            SetCudaParameters(sub_prob);
        }

Пример #5

Показать файл

Файл: SparseMatrixMatrixProd.cs Проект: ksopyla/CudaDotProd

        /// <summary>
        /// implementation of sparese matrix product
        /// </summary>
        /// <param name="repetition">how many times kernel should be launch</param>
        /// <param name="moduleFunction">cuda kenrel name</param>
        /// <param name="blockSizeX">block size X</param>
        /// <param name="blockSizeY">block size Y</param>
        /// <param name="transposeGrid">indicate that grid dimensions should be 
        /// computed alternativly, if false than gridDimY- connected with rows
        /// else gridDim.Y conected with cols</param>
        /// <returns></returns>
        public static float[] CRSSparseMM(int repetition, string moduleFunction, 
            int blockSizeX,int blockSizeY, bool transposeGrid)
        {
            //int blockSizeX = 4;
            //int blockSizeY = 4;

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;
            int maxIndex = 0;
            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen,out maxIndex);

               // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen,out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);

            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals);
            CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx);
            CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen);

            int outputSize = Rows * Cols;
            float[] output = new float[outputSize];
            //CUdeviceptr dOutput = cuda.Allocate(output);

            IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            int Aelements = AVals.Length;
            int Belements = BVals.Length;

            cuda.SetFunctionBlockShape(cuFunc,blockSizeX,blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            cuda.SetParameter(cuFunc, offset, (uint)Aelements);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Belements);
            offset += sizeof(int);

            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            //cuda.SetTextureFlags(cuTexRef, 0);

            int gridDimX =(int) Math.Ceiling((Cols + 0.0) / (blockSizeX));
            int gridDimY = (int)Math.Ceiling((0.0+Rows)/blockSizeY);
            if (transposeGrid)
            {
                gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
                gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY);
            }

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);

            for (int k = 0; k < repetition; k++)
            {
                cuda.Launch(cuFunc, gridDimX, gridDimY);

                cuda.SynchronizeContext();
               //  cuda.CopyDeviceToHost(dOutput, output);
                Marshal.Copy(outputPtr2, output, 0, outputSize);
            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Console.WriteLine("Matrix products with kernel {0}",moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);

            cuda.Free(BValsPtr);
            cuda.Free(BIdxPtr);
            cuda.Free(BLenghtPtr);

            cuda.Free(dOutput);

            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return output;
        }

Пример #6

Показать файл

Файл: SparseMatrixMatrixProd.cs Проект: ksopyla/CudaDotProd

        public static float[] CRSSparseMMwithDenseVector(int repetition,
            string moduleFunction, int blockSizeX, int blockSizeY)
        {
            CUDA cuda = new CUDA(0, true);

            // load module

            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in Crs format
            int[] AIdx, BIdx;
            //Lenght of each row in CRS format
            int[] ARowLen, BRowLen;

            int maxIndex = 0;
            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);

            // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);
            //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex);

            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            int outputSize = Rows * Cols;
            float[] output = new float[outputSize];

            //allocate memory for output
            IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            //create dense vector for each column in B matrix
            float[] mainVec = new float[maxIndex + 1];

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            CUstream stream0 =cuda.CreateStream();

            IntPtr[] mainVecIntPtrs= new IntPtr[2];

            //write combined memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //
            //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);
            //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED);

            mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
            mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
            CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE);
            //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0);

            //mapped memory allocation
            //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize);

            //get texture reference
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");
            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            int Aelements = AVals.Length;

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            int colIndexParamOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));
            int gridDim= (Rows + blockSizeX - 1) / blockSizeX;

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            for (int rep = 0; rep < repetition; rep++)
            {
                for (int k = 0; k < Cols; k++)
                {

                    Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                    cuda.SynchronizeStream(stream0);

                    cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                    cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                    cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    Helpers.SetBufferIdx(BIdx, BRowLen, k-1, mainVecIntPtrs[(k+1) % 2], 0.0f);

                    //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr);
                    ////make asynchronius copy and kernel lauch
                    //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0);
                    //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k);
                    //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);
                    //cuda.SynchronizeStream(stream0);
                    ////clear host buffer
                    //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f);
                }
            }
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Marshal.Copy(outputPtr2, output, 0, outputSize);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

            int lenght = displayCount;// Math.Min(displayCount, Rows);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);
            cuda.Free(dOutput);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            cuda.DestroyStream(stream0);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);

            return output;
        }

Пример #7

Показать файл

Файл: Program.cs Проект: ksopyla/CudaDotProd

        //private static void InitMainVector(float[] vecVals, int[] vecIdx, int[] vecLenght, float[] mainVec)
        //{
        //    for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
        //    {
        //        int idx = vecIdx[j];
        //        float val = vecVals[j];
        //        mainVec[idx] = val;
        //    }
        //}
        private static float[] CuDotProdCSRwriteCombined(int repetition)
        {
            //always the same values
            Random rnd = new Random(1);

            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction("spmv_csr_vector_kernel_wc");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();

            //temp lists for values, indices and vecotr lenght
            List<float> vecValsL = new List<float>(N * maxRowSize / 2);
            List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
            List<int> vecLenghtL = new List<int>(N);

            float[] vecVals;
            int[] vecIdx;
            int[] vecLenght;

            maxIndex = 0;
            int vecStartIdx = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                vecValsL.AddRange(vals);

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                vecIdxL.AddRange(index);

                vecLenghtL.Add(vecStartIdx);
                vecStartIdx += vecSize;

            }
            //for last index
            vecLenghtL.Add(vecStartIdx);

            vecVals = vecValsL.ToArray();
            vecIdx = vecIdxL.ToArray();
            vecLenght = vecLenghtL.ToArray();

            Console.WriteLine("Init takes {0}", t.Elapsed);
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
            CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);

            float[] output = new float[N];
            //CUdeviceptr dOutput = cuda.Allocate(output);

            IntPtr outputPtr2 = cuda.HostAllocate((uint)(N * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));
            uint flags = CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP | CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED;
            uint tt = (uint)CUMemHostAllocFlags.WriteCombined;
            uint s = (uint)CUMemHostAllocFlags.DeviceMap;
            IntPtr mainVecIntPtr = cuda.HostAllocate(memSize, flags);

            CUdeviceptr mainVecPtr = cuda.GetHostDevicePointer(mainVecIntPtr, 0);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters
            cuda.SetFunctionBlockShape(cuFunc, threadsPerBlock, 1, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, mainVecPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)vecStartIdx);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            mainIndex = StartingIndex;
            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);

            for (int k = 0; k < repetition; k++)
            {

                //float[] tempFloatarr = new float[memSize];
               Helpers.InitBuffer(vecVals, vecIdx, vecLenght,mainIndex, mainVecIntPtr);

                //Marshal.Copy(mainVecIntPtr, tempFloatarr, 0, tempFloatarr.Length);

                cuda.Launch(cuFunc, blocksPerGrid, 1);

                cuda.SynchronizeContext();
                //cuda.CopyDeviceToHost(dOutput, output);
                Marshal.Copy(outputPtr2, output, 0, N);

                //mainVec = new float[maxIndex + 1];
                //Array.Clear(mainVec, 0, mainVec.Length);

                //clear previous vector values

                Helpers.SetBufferIdx(vecIdx, vecLenght,mainIndex, mainVecIntPtr,0.0f);
                mainIndex++;

            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            //cuda.SynchronizeEvent(end);

            // cuda.CopyDeviceToHost(dOutput, output);

            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.Free(vecLenghtPtr);
            //cuda.DestroyArray(cuArr);
            cuda.Free(mainVecPtr);
            //cuda.DestroyTexture(cuTexRef);

               // cuda.Free(mainVecPtr);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return output;
        }