示例#1
0
        public override void Init()
        {
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            float[] vecVals;
            int[]   vecColIdx;
            int[]   vecLenght;

            int align = 2;

            CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements, align);
            // CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements);

            selfLinDot = linKernel.DiagonalDotCache;

            #region cuda initialization

            InitCudaModule();

            //copy data to device, set cuda function parameters
            valsPtr      = cuda.CopyHostToDevice(vecVals);
            idxPtr       = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr = cuda.CopyHostToDevice(vecLenght);


            selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

            uint memSize = (uint)(problemElements.Length * sizeof(float));
            //allocate mapped memory for our results
            //CUDARuntime.cudaSetDeviceFlags(CUDARuntime.cudaDeviceMapHost);



            // var e= CUDADriver.cuMemHostAlloc(ref outputIntPtr, memSize, 8);
            //CUDARuntime.cudaHostAlloc(ref outputIntPtr, memSize, CUDARuntime.cudaHostAllocMapped);
            //var errMsg=CUDARuntime.cudaGetErrorString(e);
            //cuda.HostRegister(outputIntPtr,memSize, Cuda)
            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuda.GetHostDevicePointer(outputIntPtr, 0);

            //normal memory allocation
            //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length));


            #endregion

            SetCudaFunctionParameters();

            //allocate memory for main vector, size of this vector is the same as dimension, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

            if (MakeDenseVectorOnGPU)
            {
                vecBuilder = new EllpackDenseVectorBuilder(cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, problemElements.Length, problemElements[0].Dim);
                vecBuilder.Init();
            }
        }
示例#2
0
        /// <summary>
        /// Predicts the specified elements.
        /// </summary>
        /// <param name="elements">The elements.</param>
        /// <returns>array of predicted labels +1 or -1</returns>
        public override float[] Predict(SparseVec[] elements)
        {
            if (!IsInitialized)
            {
                throw new ApplicationException("Evaluator is not initialized. Call init method");
            }


            //tranfsorm elements to matrix in CSR format
            // elements values
            float[] vecVals;
            //elements indexes
            int[] vecIdx;
            //elements lenght
            int[] vecLenght;
            CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, elements);

            //copy data to device, set cuda function parameters
            valsPtr      = cuda.CopyHostToDevice(vecVals);
            idxPtr       = cuda.CopyHostToDevice(vecIdx);
            vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);

            //release arrays
            vecVals   = null;
            vecIdx    = null;
            vecLenght = null;

            uint memElementsSize = (uint)(elements.Length * sizeof(float));

            //allocate mapped memory for our results
            outputIntPtr = cuda.HostAllocate(memElementsSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuda.GetHostDevicePointer(outputIntPtr, 0);

            //outputPtr = cuda.Allocate(memElementsSize);

            // Set the cuda kernel paramerters
            #region set cuda parameters
            uint Rows = (uint)elements.Length;
            uint Cols = (uint)TrainedModel.SupportElements.Length;


            cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

            int offset = 0;
            //set elements param
            cuda.SetParameter(cuFunc, offset, valsPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, idxPtr.Pointer);
            offset += IntPtr.Size;
            cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer);
            offset += IntPtr.Size;

            //set labels param
            cuda.SetParameter(cuFunc, offset, labelsPtr.Pointer);
            offset += IntPtr.Size;
            //set alphas param
            cuda.SetParameter(cuFunc, offset, alphasPtr.Pointer);
            offset += IntPtr.Size;
            //set output (reslut) param
            cuda.SetParameter(cuFunc, offset, outputPtr.Pointer);
            offset += IntPtr.Size;
            //set number of elements param
            cuda.SetParameter(cuFunc, offset, (uint)Rows);
            offset += sizeof(int);
            //set number of support vectors param
            cuda.SetParameter(cuFunc, offset, (uint)Cols);
            offset += sizeof(int);
            //set support vector index param
            lastParameterOffset = offset;
            cuda.SetParameter(cuFunc, offset, (uint)0);
            offset += sizeof(int);
            cuda.SetParameterSize(cuFunc, (uint)offset);
            #endregion

            int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));


            for (int k = 0; k < TrainedModel.SupportElements.Length; k++)
            {
                //set the buffer values from k-th support vector
                CudaHelpers.InitBuffer(TrainedModel.SupportElements[k], svVecIntPtrs[k % 2]);

                cuda.SynchronizeStream(stream);
                //copy asynchronously from buffer to devece
                cuda.CopyHostToDeviceAsync(mainVecPtr, svVecIntPtrs[k % 2], memSvSize, stream);
                //set the last parameter in kernel (column index)
                // colIndexParamOffset
                cuda.SetParameter(cuFunc, lastParameterOffset, (uint)k);
                //launch kernl
                cuda.LaunchAsync(cuFunc, gridDimX, 1, stream);

                if (k > 0)
                {
                    //clear the previous host buffer
                    CudaHelpers.SetBufferIdx(TrainedModel.SupportElements[k - 1], svVecIntPtrs[(k + 1) % 2], 0.0f);
                }
            }

            //CUdeviceptr symbolAdr;
            //CUDARuntime.cudaGetSymbolAddress(ref symbolAdr,"RHO");
            rho = TrainedModel.Bias;
            //IntPtr symbolVal = new IntPtr(&rho);
            //CUDARuntime.cudaMemcpyToSymbol("RHO", symbolVal, 1, 1, cudaMemcpyKind.cudaMemcpyHostToDevice);

            cuda.SetFunctionBlockShape(cuFuncSign, blockSizeX, blockSizeY, 1);
            int signFuncOffset = 0;
            //set array param
            cuda.SetParameter(cuFuncSign, signFuncOffset, outputPtr.Pointer);
            signFuncOffset += IntPtr.Size;
            //set size
            cuda.SetParameter(cuFuncSign, signFuncOffset, Rows);
            signFuncOffset += sizeof(int);

            cuda.SetParameter(cuFuncSign, signFuncOffset, rho);
            signFuncOffset += sizeof(float);

            cuda.SetParameterSize(cuFuncSign, (uint)signFuncOffset);


            //gridDimX is valid for this function
            cuda.LaunchAsync(cuFuncSign, gridDimX, 1, stream);

            //wait for all computation
            cuda.SynchronizeContext();


            float[] result = new float[elements.Length];
            //copy result
            Marshal.Copy(outputIntPtr, result, 0, elements.Length);

            return(result);
        }
示例#3
0
        public override void Init()
        {
            linKernel.ProblemElements = problemElements;
            linKernel.Y = Y;
            linKernel.Init();

            base.Init();

            blockSize = threadsPerRow * sliceSize;
            int N = problemElements.Length;

            blocksPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize);

            align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64;


            float[] vecVals;
            int[]   vecColIdx;
            int[]   vecLenght;
            int[]   sliceStart;

            CudaHelpers.TransformToSlicedEllpack(out vecVals, out vecColIdx, out sliceStart, out vecLenght, problemElements, threadsPerRow, sliceSize);

            selfLinDot = linKernel.DiagonalDotCache;



            #region cuda initialization

            InitCudaModule();

            //copy data to device, set cuda function parameters
            valsPtr       = cuda.CopyHostToDevice(vecVals);
            idxPtr        = cuda.CopyHostToDevice(vecColIdx);
            vecLengthPtr  = cuda.CopyHostToDevice(vecLenght);
            sliceStartPtr = cuda.CopyHostToDevice(sliceStart);



            labelsPtr = cuda.CopyHostToDevice(Y);
            //!!!!!
            selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

            uint memSize = (uint)(problemElements.Length * sizeof(float));

            outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
            outputPtr    = cuda.GetHostDevicePointer(outputIntPtr, 0);

            //normal memory allocation
            //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length));


            #endregion

            SetCudaFunctionParameters();

            //allocate memory for main vector, size of this vector is the same as dimenson, so many
            //indexes will be zero, but cuda computation is faster
            mainVector = new float[problemElements[0].Dim + 1];
            CudaHelpers.FillDenseVector(problemElements[0], mainVector);

            CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

            // CudaHelpers.SetTextureMemory(cuda,cuModule,ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
        }
示例#4
0
        public override float[] Predict(SparseVec[] elements)
        {
            float[] prediction = new float[elements.Length];

            uint reduceSize = (uint)reductionBlocks * sizeof(float);

            int loop = (elements.Length + NUM_STREAMS - 1) / NUM_STREAMS;

            for (int i = 0; i < loop; i++)
            {
                for (int s = 0; s < NUM_STREAMS; s++)
                {
                    int idx = i * NUM_STREAMS + s;
                    if (idx < elements.Length)
                    {
                        var vec = elements[idx];

                        //remove
                        //float[] svDots = TrainedModel.SupportElements.Select(sv => sv.DotProduct(vec)).ToArray();

                        //set nonzero values to dense vector accessible through vecIntPtr
                        CudaHelpers.InitBuffer(vec, mainVecIntPtrs[s]);

                        #region sync version
                        cuda.CopyHostToDevice(mainVecCuPtr[s], mainVecIntPtrs[s], vectorsDimMemSize);

                        cuda.SetParameter(cuFuncEval, kernelResultParamOffset, evalOutputCuPtr[s]);
                        //cuda.SetParameter(cuFuncEval, vectorSelfDotParamOffset, vec.DotProduct());
                        SetCudaEvalFuncParamsForVector(vec);
                        cuda.SetParameter(cuFuncEval, texSelParamOffset, s + 1);



                        cuda.Launch(cuFuncEval, evalBlocks, 1);

                        float[] t = new float[sizeSV];
                        cuda.CopyDeviceToHost(evalOutputCuPtr[s], t);

                        cuda.SetParameter(cuFuncReduce, offsetMemToReduce, evalOutputCuPtr[s]);
                        cuda.SetParameter(cuFuncReduce, offsetOutMemReduce, reduceCuPtr[s]);
                        cuda.Launch(cuFuncReduce, reductionBlocks, 1);

                        cuda.CopyDeviceToHost(reduceCuPtr[s], reduceIntPtrs[s], reduceSize);
                        float[] r = new float[reductionBlocks];
                        cuda.CopyDeviceToHost(reduceCuPtr[s], r);
                        #endregion


                        //cuda.CopyHostToDeviceAsync(mainVecCuPtr[s], mainVecIntPtrs[s], vectorsDimMemSize, stream[s]);
                        ////cuFunc user different textures
                        //cuda.SetParameter(cuFuncEval, kernelResultParamOffset, evalOutputCuPtr[s]);
                        //cuda.SetParameter(cuFuncEval, vectorSelfDotParamOffset, vec.DotProduct());
                        //cuda.SetParameter(cuFuncEval, texSelParamOffset, s + 1);
                        //cuda.LaunchAsync(cuFuncEval, evalBlocks, 1, stream[s]);

                        //cuda.SetParameter(cuFuncReduce, offsetMemToReduce, evalOutputCuPtr[s]);
                        //cuda.SetParameter(cuFuncReduce, offsetOutMemReduce, reduceCuPtr[s]);
                        //cuda.LaunchAsync(cuFuncReduce, reductionBlocks, 1, stream[s]);

                        //cuda.CopyDeviceToHostAsync(reduceCuPtr[s], reduceIntPtrs[s], reduceSize, stream[s]);
                    }
                }

                //wait for all streams
                cuda.SynchronizeContext();

                for (int s = 0; s < NUM_STREAMS; s++)
                {
                    int idx = i * NUM_STREAMS + s;
                    if (idx < elements.Length)
                    {
                        var vec = elements[idx];
                        //clear the buffer
                        //set nonzero values to dense vector accessible thought vecIntPtr
                        CudaHelpers.SetBufferIdx(vec, mainVecIntPtrs[s], 0.0f);
                        float evalValue = ReduceOnHost(reduceIntPtrs[s], reductionBlocks);

                        prediction[idx] = evalValue;
                    }
                }
            }



            return(prediction);
        }