Exemple #1
0
        /// <summary>
        /// Sets up the CUDA state used by this evaluator: creates the context, loads the
        /// module and its two kernels, allocates the host/device buffers for one dense
        /// vector, binds that buffer to the "svTexRef" texture and uploads the labels and
        /// alphas of the trained model. Sets <c>IsInitialized</c> when done.
        /// </summary>
        public override void Init()
        {
            cuda = new CUDA(0, true);

            // Context is created with the MapHost flag and made current before anything else.
            var context = cuda.CreateContext(0, CUCtxFlags.MapHost);
            cuda.SetCurrentContext(context);

            string modulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
            cuModule   = cuda.LoadModule(modulePath);
            cuFunc     = cuda.GetModuleFunction(cudaEvaluatorKernelName);
            cuFuncSign = cuda.GetModuleFunction(cudaSignKernelName);

            stream = cuda.CreateStream();

            // Byte size of one dense vector, derived from the dimension of the first support element.
            memSvSize = (uint)(TrainedModel.SupportElements[0].Dim * sizeof(float));

            // Two host buffers of that size; the first one is also pushed to the device.
            svVecIntPtrs[0] = cuda.AllocateHost(memSvSize);
            svVecIntPtrs[1] = cuda.AllocateHost(memSvSize);
            mainVecPtr = cuda.CopyHostToDeviceAsync(svVecIntPtrs[0], memSvSize, stream);

            // Bind the device copy of the vector to the module texture.
            cuSVTexRef = cuda.GetModuleTexture(cuModule, "svTexRef");
            cuda.SetTextureFlags(cuSVTexRef, 0);
            cuda.SetTextureAddress(cuSVTexRef, mainVecPtr, memSvSize);

            int supportCount = TrainedModel.SupportElements.Length;
            float[] labels = new float[supportCount];
            float[] alphas = new float[supportCount];

            // Labels are read by position, alphas through the support-element index table.
            Parallel.For(0, TrainedModel.SupportElementsIndexes.Length, pos =>
            {
                int sourceIdx = TrainedModel.SupportElementsIndexes[pos];

                labels[pos] = TrainedModel.Y[pos];
                alphas[pos] = TrainedModel.Alpha[sourceIdx];
            });

            labelsPtr = cuda.CopyHostToDevice(labels);
            alphasPtr = cuda.CopyHostToDevice(alphas);

            IsInitialized = true;
        }
Exemple #2
0
        /// <summary>
        /// Uploads the labels and alphas of the trained model to the device and prepares,
        /// for each of the <c>NUM_STREAMS</c> streams, a stream, a vector buffer bound to
        /// its texture, an output buffer and a reduction buffer.
        /// </summary>
        protected void SetCudaData()
        {
            float[] labels = new float[sizeSV];
            float[] alphas = new float[sizeSV];

            // Labels are read by position, alphas through the support-element index table.
            Parallel.For(0, sizeSV, pos =>
            {
                int sourceIdx = TrainedModel.SupportElementsIndexes[pos];

                labels[pos] = TrainedModel.Y[pos];
                alphas[pos] = TrainedModel.Alpha[sourceIdx];
            });

            labelsPtr = cuda.CopyHostToDevice(labels);
            alphasPtr = cuda.CopyHostToDevice(alphas);

            // Byte size of one vector: dimension of the first support element plus one extra float.
            vectorsDimMemSize = (uint)((TrainedModel.SupportElements[0].Dim + 1) * sizeof(float));

            for (int s = 0; s < NUM_STREAMS; s++)
            {
                stream[s] = cuda.CreateStream();

                // Host buffer for a single vector and its device counterpart.
                mainVecIntPtrs[s] = cuda.AllocateHost(vectorsDimMemSize);
                mainVecCuPtr[s]   = cuda.CopyHostToDevice(mainVecIntPtrs[s], vectorsDimMemSize);

                // Device output buffer, sized like the alphas array (one float per support vector).
                evalOutputCuPtr[s] = cuda.Allocate(alphas);

                // Texture of this stream reads from the device vector buffer.
                cuVecTexRef[s] = cuda.GetModuleTexture(cuModule, cudaVecTexRefName[s]);
                cuda.SetTextureAddress(cuVecTexRef[s], mainVecCuPtr[s], vectorsDimMemSize);

                // Host/device pair for the reduction results.
                uint reduceMemSize = (uint)maxReductionBlocks * sizeof(float);
                reduceIntPtrs[s] = cuda.AllocateHost(reduceMemSize);
                reduceCuPtr[s]   = cuda.CopyHostToDevice(reduceIntPtrs[s], reduceMemSize);
            }
        }
Exemple #3
0
        /// <summary>
        /// Looks up the texture reference named <paramref name="texName"/>, copies
        /// <paramref name="data"/> to the device and sets the texture address to that copy.
        /// </summary>
        /// <param name="cuda">CUDA wrapper used for all calls.</param>
        /// <param name="texture">Receives the texture reference.</param>
        /// <param name="texName">Name of the texture reference to look up.</param>
        /// <param name="data">Host values to upload.</param>
        /// <param name="memPtr">Receives the device pointer of the uploaded data.</param>
        internal static void SetTextureMemory(CUDA cuda, ref CUtexref texture, string texName, float[] data, ref CUdeviceptr memPtr)
        {
            texture = cuda.GetModuleTexture(texName);
            memPtr = cuda.CopyHostToDevice(data);

            // Size of the bound region in bytes.
            uint byteCount = (uint)(sizeof(float) * data.Length);
            cuda.SetTextureAddress(texture, memPtr, byteCount);
        }
Exemple #4
0
 /// <summary>
 /// Looks up a texture reference in <c>cuModule</c>, allocates a device buffer of
 /// <paramref name="memSize"/> bytes and sets the texture address to that buffer.
 /// </summary>
 /// <param name="texture">Receives the texture reference.</param>
 /// <param name="texName">Name of the texture reference inside the module.</param>
 /// <param name="memPtr">Receives the pointer to the newly allocated device memory.</param>
 /// <param name="memSize">Size of the allocation (and of the bound region) in bytes.</param>
 private void SetTextureMemory(ref CUtexref texture, string texName, ref CUdeviceptr memPtr, uint memSize)
 {
     CUtexref textureRef   = cuda.GetModuleTexture(cuModule, texName);
     CUdeviceptr devBuffer = cuda.Allocate(memSize);

     texture = textureRef;
     memPtr  = devBuffer;
     cuda.SetTextureAddress(textureRef, devBuffer, memSize);
 }
Exemple #5
0
        /// <summary>
        /// Benchmark of a sparse matrix product on the GPU. Two random sparse matrices A
        /// (<c>Rows</c> rows) and B (<c>Cols</c> rows) are generated in CRS format; for every
        /// row k of B a dense vector is built in a pinned host buffer, copied to the device
        /// buffer bound to the "vectorTexRef" texture, and the kernel is launched with k as
        /// its last parameter. Results are written by the kernel into mapped host memory.
        /// </summary>
        /// <param name="repetition">How many times the whole product is repeated.</param>
        /// <param name="moduleFunction">Name of the kernel inside "matrixKernels.cubin".</param>
        /// <param name="blockSizeX">Block size in X; also determines the grid size.</param>
        /// <param name="blockSizeY">Block size in Y.</param>
        /// <returns>The <c>Rows * Cols</c> output values copied back from the mapped buffer.</returns>
        public static float[] CRSSparseMMwithDenseVector(int repetition,
                                                         string moduleFunction, int blockSizeX, int blockSizeY)
        {
            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in CRS format
            int[] AIdx, BIdx;
            //length of each row in CRS format
            int[] ARowLen, BRowLen;

            // The second call overwrites maxIndex, so the dense vector below is sized from B.
            int maxIndex = 0;

            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);

            Console.WriteLine("Init takes {0}", t.Elapsed);

            // Start() on a running stopwatch does nothing; reset first so the next
            // measurement covers only the copy phase.
            t.Reset();
            t.Start();

            CUdeviceptr AValsPtr   = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr    = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            int outputSize = Rows * Cols;

            float[] output = new float[outputSize];

            //mapped host memory for output: the kernel writes through dOutput, the host reads outputPtr2
            IntPtr      outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput    = cuda.GetHostDevicePointer(outputPtr2, 0);

            //size in bytes of the dense vector built for each row of B
            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            CUstream stream0 = cuda.CreateStream();

            //two pinned host buffers used alternately (k % 2) as staging area
            IntPtr[] mainVecIntPtrs = new IntPtr[2];
            mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
            mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
            CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

            //get texture reference and bind the device vector to it
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");

            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            //the last parameter (index of the current B row) is rewritten before every launch
            int colIndexParamOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end   = cuda.CreateEvent();

            //ceil(Rows / blockSizeX) in integer arithmetic
            int gridDimX = (Rows + blockSizeX - 1) / blockSizeX;

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            for (int rep = 0; rep < repetition; rep++)
            {
                for (int k = 0; k < Cols; k++)
                {
                    Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                    //wait for the previous copy/launch before reusing the device vector
                    cuda.SynchronizeStream(stream0);

                    cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                    cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
                    cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);

                    //clear the other host buffer while the GPU works
                    //NOTE(review): for k == 0 this passes row index -1 - confirm Helpers.SetBufferIdx tolerates it
                    Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);
                }
            }
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Marshal.Copy(outputPtr2, output, 0, outputSize);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

            //never print past the end of the output array
            int lenght = Math.Min(displayCount, outputSize);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            cuda.DestroyStream(stream0);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);

            //host allocations are released through the host pointer; dOutput is only the
            //device-side alias of outputPtr2 and must not be passed to Free
            cuda.FreeHost(outputPtr2);
            cuda.FreeHost(mainVecIntPtrs[0]);
            cuda.FreeHost(mainVecIntPtrs[1]);

            return(output);
        }
Exemple #6
0
 /// <summary>
 /// Looks up the texture reference named <paramref name="texName"/>, copies
 /// <paramref name="data"/> to the device and sets the texture address to that copy.
 /// </summary>
 /// <param name="cuda">CUDA wrapper used for all calls.</param>
 /// <param name="texture">Receives the texture reference.</param>
 /// <param name="texName">Name of the texture reference to look up.</param>
 /// <param name="data">Host values to upload.</param>
 /// <param name="memPtr">Receives the device pointer of the uploaded data.</param>
 internal static void SetTextureMemory(CUDA cuda, ref CUtexref texture, string texName, float[] data, ref CUdeviceptr memPtr)
 {
     texture = cuda.GetModuleTexture(texName);
     memPtr  = cuda.CopyHostToDevice(data);

     // Size of the bound region in bytes.
     uint byteCount = (uint)(sizeof(float) * data.Length);
     cuda.SetTextureAddress(texture, memPtr, byteCount);
 }
        /// <summary>
        /// Benchmark of a sparse matrix product on the GPU. Two random sparse matrices A
        /// (<c>Rows</c> rows) and B (<c>Cols</c> rows) are generated in CRS format; for every
        /// row k of B a dense vector is built in a pinned host buffer, copied to the device
        /// buffer bound to the "vectorTexRef" texture, and the kernel is launched with k as
        /// its last parameter. Results are written by the kernel into mapped host memory.
        /// </summary>
        /// <param name="repetition">How many times the whole product is repeated.</param>
        /// <param name="moduleFunction">Name of the kernel inside "matrixKernels.cubin".</param>
        /// <param name="blockSizeX">Block size in X; also determines the grid size.</param>
        /// <param name="blockSizeY">Block size in Y.</param>
        /// <returns>The <c>Rows * Cols</c> output values copied back from the mapped buffer.</returns>
        public static float[] CRSSparseMMwithDenseVector(int repetition,
            string moduleFunction, int blockSizeX, int blockSizeY)
        {
            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("------------------------------------");
            Console.WriteLine("init Matrix");
            Stopwatch t = Stopwatch.StartNew();

            //values in CRS format
            float[] AVals, BVals;
            //indexes in CRS format
            int[] AIdx, BIdx;
            //length of each row in CRS format
            int[] ARowLen, BRowLen;

            // The second call overwrites maxIndex, so the dense vector below is sized from B.
            int maxIndex = 0;
            MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);
            MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);

            Console.WriteLine("Init takes {0}", t.Elapsed);

            // Start() on a running stopwatch does nothing; reset first so the next
            // measurement covers only the copy phase.
            t.Reset();
            t.Start();

            CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
            CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
            CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

            int outputSize = Rows * Cols;
            float[] output = new float[outputSize];

            //mapped host memory for output: the kernel writes through dOutput, the host reads outputPtr2
            IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            //size in bytes of the dense vector built for each row of B
            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            CUstream stream0 = cuda.CreateStream();

            //two pinned host buffers used alternately (k % 2) as staging area
            IntPtr[] mainVecIntPtrs = new IntPtr[2];
            mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
            mainVecIntPtrs[1] = cuda.AllocateHost(memSize);
            CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

            //get texture reference and bind the device vector to it
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");
            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters

            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);

            //the last parameter (index of the current B row) is rewritten before every launch
            int colIndexParamOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            //ceil(Rows / blockSizeX) in integer arithmetic
            int gridDimX = (Rows + blockSizeX - 1) / blockSizeX;

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            for (int rep = 0; rep < repetition; rep++)
            {
                for (int k = 0; k < Cols; k++)
                {
                    Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

                    //wait for the previous copy/launch before reusing the device vector
                    cuda.SynchronizeStream(stream0);

                    cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
                    cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
                    cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);

                    //clear the other host buffer while the GPU works
                    //NOTE(review): for k == 0 this passes row index -1 - confirm Helpers.SetBufferIdx tolerates it
                    Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);
                }
            }
            cuda.RecordEvent(end);
            cuda.SynchronizeContext();

            timer.Stop();
            float cudaTime = cuda.ElapsedTime(start, end);

            Marshal.Copy(outputPtr2, output, 0, outputSize);

            Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
            Console.WriteLine("  takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

            //never print past the end of the output array
            int lenght = Math.Min(displayCount, outputSize);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(AValsPtr);
            cuda.Free(AIdxPtr);
            cuda.Free(ALenghtPtr);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            cuda.DestroyStream(stream0);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);

            //host allocations are released through the host pointer; dOutput is only the
            //device-side alias of outputPtr2 and must not be passed to Free
            cuda.FreeHost(outputPtr2);
            cuda.FreeHost(mainVecIntPtrs[0]);
            cuda.FreeHost(mainVecIntPtrs[1]);

            return output;
        }
Exemple #8
0
        /// <summary>
        /// Benchmark of the "RBFspmv_csr_vector" kernel from "structKernel.cubin". Builds
        /// <c>N</c> sparse vectors in CSR layout (values, indices, row offsets) together with
        /// their self dot products, expands vector <c>mainIndex</c> into a dense array that is
        /// bound to the "texRef" texture, runs the kernel once and prints the first results.
        /// </summary>
        /// <returns>The <c>N</c> values computed by the kernel.</returns>
        private static float[] CuRBFCSRCached()
        {
            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction structPassFunc = cuda.GetModuleFunction("RBFspmv_csr_vector");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();
            List<float> vecValsL = new List<float>(N * maxRowSize / 2);
            List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
            // N start offsets plus one closing offset are stored
            List<int> vecLenghtL = new List<int>(N + 1);

            float[] vecVals;
            int[] vecIdx;
            int[] vecLenght;
            float[] selfDot = new float[N];

            maxIndex = 0;
            int vecStartIdx = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                vecValsL.AddRange(vals);

                // squared norm of vector i
                for (int z = 0; z < vals.Length; z++)
                {
                    selfDot[i] += vals[z] * vals[z];
                }
                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                vecIdxL.AddRange(index);

                // offset at which vector i starts in the flattened arrays
                vecLenghtL.Add(vecStartIdx);
                vecStartIdx += vecSize;
            }
            //closing offset: total number of stored elements
            vecLenghtL.Add(vecStartIdx);

            vecVals = vecValsL.ToArray();
            vecIdx = vecIdxL.ToArray();
            vecLenght = vecLenghtL.ToArray();

            // dense copy of vector mainIndex
            float[] mainVec = new float[maxIndex + 1];

            for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
            {
                int idx = vecIdx[j];
                float val = vecVals[j];
                mainVec[idx] = val;
            }
            Console.WriteLine("Init takes {0}", t.Elapsed);

            // Start() on a running stopwatch does nothing; reset first so the next
            // measurement covers only the copy phase.
            t.Reset();
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
            CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);
            CUdeviceptr selfDotPtr = cuda.CopyHostToDevice(selfDot);

            //copy the dense vector into a CUDA array and bind it to the texture
            CUarray cuArr = cuda.CreateArray(mainVec);
            cuda.CopyHostToArray(cuArr, mainVec, 0);
            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);
            cuda.SetTextureArray(cuTexRef, cuArr);

            float[] output = new float[N];
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            // kernel arguments: five device pointers, N, mainIndex, Gamma, element count
            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, vecLenghtPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, selfDotPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, Gamma);
            offset += sizeof(float);

            cuda.SetParameter(structPassFunc, offset, (uint)vecStartIdx);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.Free(selfDotPtr);
            cuda.Free(vecLenghtPtr);
            cuda.DestroyArray(cuArr);
            cuda.DestroyTexture(cuTexRef);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            return output;
        }
Exemple #9
0
        /// <summary>
        /// Benchmark of the "DotProdEllPackCached" kernel from "structKernel.cubin". Builds
        /// <c>N</c> sparse vectors in two <c>N * maxRowSize</c> arrays stored column-major
        /// (element z of vector i lives at <c>z * N + i</c>), expands vector <c>mainIndex</c>
        /// into a dense array bound to the "texRef" texture, runs the kernel once and prints
        /// the first results.
        /// </summary>
        /// <returns>The <c>N</c> values computed by the kernel.</returns>
        private static float[] CuDotProdEllPackTexCached()
        {
            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction structPassFunc = cuda.GetModuleFunction("DotProdEllPackCached");

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();
            float[] vecVals = new float[N * maxRowSize];
            int[] vecIdx = new int[N * maxRowSize];

            maxIndex = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);

                //values are stored column-major
                for (int z = 0; z < vals.Length; z++)
                {
                    int m = z * N + i;
                    vecVals[m] = vals[z];
                }

                //indices use the same column-major layout
                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                for (int z = 0; z < index.Length; z++)
                {
                    int m = z * N + i;
                    vecIdx[m] = index[z];
                }
            }

            // dense copy of vector mainIndex
            float[] mainVec = new float[maxIndex + 1];

            // NOTE(review): slots of a vector shorter than maxRowSize stay (index 0, value 0),
            // so this loop may write mainVec[0] = 0 after a real entry at index 0 - confirm
            // that index 0 cannot be a real index of vector mainIndex.
            for (int j = 0; j < maxRowSize; j++)
            {
                int idx = vecIdx[mainIndex + N * j];
                float val = vecVals[mainIndex + N * j];
                mainVec[idx] = val;
            }
            Console.WriteLine("Init takes {0}", t.Elapsed);

            // Start() on a running stopwatch does nothing; reset first so the next
            // measurement covers only the copy phase.
            t.Reset();
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);

            //copy the dense vector into a CUDA array and bind it to the texture
            CUarray cuArr = cuda.CreateArray(mainVec);
            cuda.CopyHostToArray(cuArr, mainVec, 0);

            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);

            cuda.SetTextureArray(cuTexRef, cuArr);

            float[] output = new float[N];
            CUdeviceptr dOutput = cuda.Allocate(output);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);

            cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

            // kernel arguments: three device pointers, maxRowSize, N
            int offset = 0;
            cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(structPassFunc, offset, (uint)maxRowSize);
            offset += sizeof(int);
            cuda.SetParameter(structPassFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameterSize(structPassFunc, (uint)offset);

            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);
            cuda.Launch(structPassFunc, blocksPerGrid, 1);

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();
            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("EllPack Cached Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

            cuda.CopyDeviceToHost(dOutput, output);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(dOutput);
            cuda.DestroyArray(cuArr);
            cuda.DestroyTexture(cuTexRef);
            // the timing events were created above and must be released as well
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);
            return output;
        }
Exemple #10
0
        /// <summary>
        /// Benchmark of a CSR dot-product kernel from "structKernel.cubin". Builds <c>N</c>
        /// sparse vectors in CSR layout (values, indices, row offsets) and, for each
        /// repetition, expands the vector at <c>mainIndex</c> into a dense array, uploads it
        /// to the device buffer bound to the "texRef" texture, launches the kernel and reads
        /// the result from mapped host memory. <c>mainIndex</c> starts at
        /// <c>StartingIndex</c> and is incremented after every repetition.
        /// </summary>
        /// <param name="repetition">Number of kernel launches (one main vector each).</param>
        /// <param name="moduleFunction">Name of the kernel to run.</param>
        /// <returns>The <c>N</c> values produced by the last launch.</returns>
        private static float[] CuDotProdCRSCached(int repetition,string moduleFunction)
        {
            CUDA cuda = new CUDA(0, true);

            // load module
            CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));

            CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

            int maxRowSize = avgElements + stdElements - 1;

            Console.WriteLine("init arrays");
            Stopwatch t = Stopwatch.StartNew();

            //temporary lists for values, indices and row offsets
            List<float> vecValsL = new List<float>(N * maxRowSize / 2);
            List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
            List<int> vecLenghtL = new List<int>(N + 1);

            float[] vecVals;
            int[] vecIdx;
            int[] vecLenght;

            maxIndex = 0;
            int vecStartIdx = 0;
            for (int i = 0; i < N; i++)
            {
                int vecSize = avgElements + i % stdElements;

                float[] vals = Helpers.InitValues(i, vecSize, maxVal);
                vecValsL.AddRange(vals);

                int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
                vecIdxL.AddRange(index);

                // offset at which vector i starts in the flattened arrays
                vecLenghtL.Add(vecStartIdx);
                vecStartIdx += vecSize;
            }
            //closing offset: total number of stored elements
            vecLenghtL.Add(vecStartIdx);

            vecVals = vecValsL.ToArray();
            vecIdx = vecIdxL.ToArray();
            vecLenght = vecLenghtL.ToArray();

            Console.WriteLine("Init takes {0}", t.Elapsed);

            // Start() on a running stopwatch does nothing; reset first so the next
            // measurement covers only the copy phase.
            t.Reset();
            t.Start();

            CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
            CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
            CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);

            float[] output = new float[N];

            //mapped host memory for output: the kernel writes through dOutput, the host reads outputPtr2
            IntPtr outputPtr2 = cuda.HostAllocate((uint)(N * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

            Console.WriteLine("copy to device takes {0}", t.Elapsed);
            #region set cuda parameters
            cuda.SetFunctionBlockShape(cuFunc, threadsPerBlock, 1, 1);

            // kernel arguments: four device pointers, N, element count
            int offset = 0;
            cuda.SetParameter(cuFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
            offset += IntPtr.Size;

            cuda.SetParameter(cuFunc, offset, (uint)N);
            offset += sizeof(int);
            cuda.SetParameter(cuFunc, offset, (uint)vecStartIdx);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion
            Console.WriteLine("start computation");

            CUevent start = cuda.CreateEvent();
            CUevent end = cuda.CreateEvent();

            CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
            cuda.SetTextureFlags(cuTexRef, 0);

            // dense main vector: host array plus the device buffer bound to the texture
            float[] mainVec = new float[maxIndex + 1];

            CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVec);

            uint memSize = (uint)((maxIndex + 1) * sizeof(float));

            cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

            mainIndex = StartingIndex;
            Stopwatch timer = Stopwatch.StartNew();
            cuda.RecordEvent(start);

            for (int k = 0; k < repetition; k++)
            {
                // rebuild the dense vector for the current mainIndex and upload it
                Helpers.InitMainVector(vecVals, vecIdx, vecLenght, mainIndex, ref mainVec);

                cuda.CopyHostToDevice(mainVecPtr, mainVec);

                cuda.Launch(cuFunc, blocksPerGrid, 1);

                // results are only valid in the mapped buffer once the kernel has finished
                cuda.SynchronizeContext();
                Marshal.Copy(outputPtr2, output, 0, N);

                mainIndex++;
            }

            cuda.RecordEvent(end);

            cuda.SynchronizeContext();

            timer.Stop();
            float naiveTime = cuda.ElapsedTime(start, end);

            Console.Write("Dot products with kernel {0}, mainIndex {1} and {2}-vectors takes {3} ms stopwatch time {4} ms",moduleFunction, mainIndex, N, naiveTime, timer.Elapsed);

            int lenght = Math.Min(displayCount, N);
            Console.WriteLine();
            for (int i = 0; i < lenght; i++)
            {
                Console.WriteLine("{0}-{1}", i, output[i]);
            }

            cuda.Free(valsPtr);
            cuda.Free(idxPtr);
            cuda.Free(vecLenghtPtr);
            cuda.Free(mainVecPtr);
            cuda.DestroyTexture(cuTexRef);
            cuda.DestroyEvent(start);
            cuda.DestroyEvent(end);

            //the output buffer is a host allocation: release it through the host pointer;
            //dOutput is only its device-side alias and must not be passed to Free
            cuda.FreeHost(outputPtr2);

            return output;
        }