public override void Init() { linKernel.ProblemElements = problemElements; linKernel.Y = Y; linKernel.Init(); base.Init(); float[] vecVals; int[] vecColIdx; int[] vecLenght; int align = 2; CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements, align); // CudaHelpers.TransformToEllpackRFormat(out vecVals, out vecColIdx, out vecLenght, problemElements); selfLinDot = linKernel.DiagonalDotCache; #region cuda initialization InitCudaModule(); //copy data to device, set cuda function parameters valsPtr = cuda.CopyHostToDevice(vecVals); idxPtr = cuda.CopyHostToDevice(vecColIdx); vecLengthPtr = cuda.CopyHostToDevice(vecLenght); selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot); uint memSize = (uint)(problemElements.Length * sizeof(float)); //allocate mapped memory for our results //CUDARuntime.cudaSetDeviceFlags(CUDARuntime.cudaDeviceMapHost); // var e= CUDADriver.cuMemHostAlloc(ref outputIntPtr, memSize, 8); //CUDARuntime.cudaHostAlloc(ref outputIntPtr, memSize, CUDARuntime.cudaHostAllocMapped); //var errMsg=CUDARuntime.cudaGetErrorString(e); //cuda.HostRegister(outputIntPtr,memSize, Cuda) outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0); //normal memory allocation //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length)); #endregion SetCudaFunctionParameters(); //allocate memory for main vector, size of this vector is the same as dimension, so many //indexes will be zero, but cuda computation is faster mainVector = new float[problemElements[0].Dim + 1]; CudaHelpers.FillDenseVector(problemElements[0], mainVector); CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr); CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr); if (MakeDenseVectorOnGPU) { vecBuilder = new EllpackDenseVectorBuilder(cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, problemElements.Length, problemElements[0].Dim); vecBuilder.Init(); } }
/// <summary> /// Predicts the specified elements. /// </summary> /// <param name="elements">The elements.</param> /// <returns>array of predicted labels +1 or -1</returns> public override float[] Predict(SparseVec[] elements) { if (!IsInitialized) { throw new ApplicationException("Evaluator is not initialized. Call init method"); } //tranfsorm elements to matrix in CSR format // elements values float[] vecVals; //elements indexes int[] vecIdx; //elements lenght int[] vecLenght; CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, elements); //copy data to device, set cuda function parameters valsPtr = cuda.CopyHostToDevice(vecVals); idxPtr = cuda.CopyHostToDevice(vecIdx); vecLenghtPtr = cuda.CopyHostToDevice(vecLenght); //release arrays vecVals = null; vecIdx = null; vecLenght = null; uint memElementsSize = (uint)(elements.Length * sizeof(float)); //allocate mapped memory for our results outputIntPtr = cuda.HostAllocate(memElementsSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0); //outputPtr = cuda.Allocate(memElementsSize); // Set the cuda kernel paramerters #region set cuda parameters uint Rows = (uint)elements.Length; uint Cols = (uint)TrainedModel.SupportElements.Length; cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1); int offset = 0; //set elements param cuda.SetParameter(cuFunc, offset, valsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, idxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer); offset += IntPtr.Size; //set labels param cuda.SetParameter(cuFunc, offset, labelsPtr.Pointer); offset += IntPtr.Size; //set alphas param cuda.SetParameter(cuFunc, offset, alphasPtr.Pointer); offset += IntPtr.Size; //set output (reslut) param cuda.SetParameter(cuFunc, offset, outputPtr.Pointer); offset += IntPtr.Size; //set number of elements param cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); //set number of support vectors param cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); //set support vector index param lastParameterOffset = offset; cuda.SetParameter(cuFunc, offset, (uint)0); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); for (int k = 0; k < TrainedModel.SupportElements.Length; k++) { //set the buffer values from k-th support vector CudaHelpers.InitBuffer(TrainedModel.SupportElements[k], svVecIntPtrs[k % 2]); cuda.SynchronizeStream(stream); //copy asynchronously from buffer to devece cuda.CopyHostToDeviceAsync(mainVecPtr, svVecIntPtrs[k % 2], memSvSize, stream); //set the last parameter in kernel (column index) // colIndexParamOffset cuda.SetParameter(cuFunc, lastParameterOffset, (uint)k); //launch kernl cuda.LaunchAsync(cuFunc, gridDimX, 1, stream); if (k > 0) { //clear the previous host buffer CudaHelpers.SetBufferIdx(TrainedModel.SupportElements[k - 1], svVecIntPtrs[(k + 1) % 2], 0.0f); } } //CUdeviceptr symbolAdr; //CUDARuntime.cudaGetSymbolAddress(ref symbolAdr,"RHO"); rho = TrainedModel.Bias; //IntPtr symbolVal = new IntPtr(&rho); //CUDARuntime.cudaMemcpyToSymbol("RHO", symbolVal, 1, 1, cudaMemcpyKind.cudaMemcpyHostToDevice); cuda.SetFunctionBlockShape(cuFuncSign, blockSizeX, blockSizeY, 1); int signFuncOffset = 0; //set array param cuda.SetParameter(cuFuncSign, signFuncOffset, outputPtr.Pointer); signFuncOffset += IntPtr.Size; //set size cuda.SetParameter(cuFuncSign, signFuncOffset, Rows); signFuncOffset += sizeof(int); cuda.SetParameter(cuFuncSign, signFuncOffset, rho); signFuncOffset += sizeof(float); cuda.SetParameterSize(cuFuncSign, (uint)signFuncOffset); //gridDimX is valid for this function cuda.LaunchAsync(cuFuncSign, gridDimX, 1, stream); //wait for all computation cuda.SynchronizeContext(); float[] result = new float[elements.Length]; //copy result Marshal.Copy(outputIntPtr, result, 0, elements.Length); return(result); }
public override void Init() { linKernel.ProblemElements = problemElements; linKernel.Y = Y; linKernel.Init(); base.Init(); blockSize = threadsPerRow * sliceSize; int N = problemElements.Length; blocksPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize); align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64; float[] vecVals; int[] vecColIdx; int[] vecLenght; int[] sliceStart; CudaHelpers.TransformToSlicedEllpack(out vecVals, out vecColIdx, out sliceStart, out vecLenght, problemElements, threadsPerRow, sliceSize); selfLinDot = linKernel.DiagonalDotCache; #region cuda initialization InitCudaModule(); //copy data to device, set cuda function parameters valsPtr = cuda.CopyHostToDevice(vecVals); idxPtr = cuda.CopyHostToDevice(vecColIdx); vecLengthPtr = cuda.CopyHostToDevice(vecLenght); sliceStartPtr = cuda.CopyHostToDevice(sliceStart); labelsPtr = cuda.CopyHostToDevice(Y); //!!!!! selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot); uint memSize = (uint)(problemElements.Length * sizeof(float)); outputIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0); //normal memory allocation //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length)); #endregion SetCudaFunctionParameters(); //allocate memory for main vector, size of this vector is the same as dimenson, so many //indexes will be zero, but cuda computation is faster mainVector = new float[problemElements[0].Dim + 1]; CudaHelpers.FillDenseVector(problemElements[0], mainVector); CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr); // CudaHelpers.SetTextureMemory(cuda,cuModule,ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr); }
public override float[] Predict(SparseVec[] elements) { float[] prediction = new float[elements.Length]; uint reduceSize = (uint)reductionBlocks * sizeof(float); int loop = (elements.Length + NUM_STREAMS - 1) / NUM_STREAMS; for (int i = 0; i < loop; i++) { for (int s = 0; s < NUM_STREAMS; s++) { int idx = i * NUM_STREAMS + s; if (idx < elements.Length) { var vec = elements[idx]; //remove //float[] svDots = TrainedModel.SupportElements.Select(sv => sv.DotProduct(vec)).ToArray(); //set nonzero values to dense vector accessible through vecIntPtr CudaHelpers.InitBuffer(vec, mainVecIntPtrs[s]); #region sync version cuda.CopyHostToDevice(mainVecCuPtr[s], mainVecIntPtrs[s], vectorsDimMemSize); cuda.SetParameter(cuFuncEval, kernelResultParamOffset, evalOutputCuPtr[s]); //cuda.SetParameter(cuFuncEval, vectorSelfDotParamOffset, vec.DotProduct()); SetCudaEvalFuncParamsForVector(vec); cuda.SetParameter(cuFuncEval, texSelParamOffset, s + 1); cuda.Launch(cuFuncEval, evalBlocks, 1); float[] t = new float[sizeSV]; cuda.CopyDeviceToHost(evalOutputCuPtr[s], t); cuda.SetParameter(cuFuncReduce, offsetMemToReduce, evalOutputCuPtr[s]); cuda.SetParameter(cuFuncReduce, offsetOutMemReduce, reduceCuPtr[s]); cuda.Launch(cuFuncReduce, reductionBlocks, 1); cuda.CopyDeviceToHost(reduceCuPtr[s], reduceIntPtrs[s], reduceSize); float[] r = new float[reductionBlocks]; cuda.CopyDeviceToHost(reduceCuPtr[s], r); #endregion //cuda.CopyHostToDeviceAsync(mainVecCuPtr[s], mainVecIntPtrs[s], vectorsDimMemSize, stream[s]); ////cuFunc user different textures //cuda.SetParameter(cuFuncEval, kernelResultParamOffset, evalOutputCuPtr[s]); //cuda.SetParameter(cuFuncEval, vectorSelfDotParamOffset, vec.DotProduct()); //cuda.SetParameter(cuFuncEval, texSelParamOffset, s + 1); //cuda.LaunchAsync(cuFuncEval, evalBlocks, 1, stream[s]); //cuda.SetParameter(cuFuncReduce, offsetMemToReduce, evalOutputCuPtr[s]); //cuda.SetParameter(cuFuncReduce, offsetOutMemReduce, reduceCuPtr[s]); //cuda.LaunchAsync(cuFuncReduce, reductionBlocks, 1, stream[s]); //cuda.CopyDeviceToHostAsync(reduceCuPtr[s], reduceIntPtrs[s], reduceSize, stream[s]); } } //wait for all streams cuda.SynchronizeContext(); for (int s = 0; s < NUM_STREAMS; s++) { int idx = i * NUM_STREAMS + s; if (idx < elements.Length) { var vec = elements[idx]; //clear the buffer //set nonzero values to dense vector accessible thought vecIntPtr CudaHelpers.SetBufferIdx(vec, mainVecIntPtrs[s], 0.0f); float evalValue = ReduceOnHost(reduceIntPtrs[s], reductionBlocks); prediction[idx] = evalValue; } } } return(prediction); }