public override void Init()
{
    // The linear kernel is used only for computing the element-wise product,
    // so there is no need to initialize it here.

    base.Init();

    // Convert the sparse problem elements into CSR-format arrays.
    float[] csrValues;
    int[] csrColIndices;
    int[] csrRowLengths;
    CudaHelpers.TransformToCSRFormat(out csrValues, out csrColIndices, out csrRowLengths, problemElements);

    #region cuda initialization

    InitCudaModule();

    // Upload the CSR matrix to the device.
    valsPtr = cuda.CopyHostToDevice(csrValues);
    idxPtr = cuda.CopyHostToDevice(csrColIndices);
    vecLengthPtr = cuda.CopyHostToDevice(csrRowLengths);

    // Allocate page-locked, device-mapped host memory for the results so the
    // kernel can write straight into host-visible memory (zero-copy).
    uint resultBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(resultBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense and sized to the full problem dimension; most
    // entries will be zero, but the dense layout is faster for the CUDA kernel.
    mainVector = new float[problemElements[0].Dim + 1];
    CudaHelpers.FillDenseVector(problemElements[0], mainVector);

    // Bind the main vector and the labels to their CUDA texture references.
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
}
public override void Init()
{
    // The linear kernel supplies the cached diagonal dot products (x_i . x_i),
    // so it has to be initialized before they are read below.
    linKernel.ProblemElements = problemElements;
    linKernel.Y = Y;
    linKernel.Init();

    base.Init();

    // Convert the sparse problem elements into CSR-format arrays.
    float[] csrValues;
    int[] csrColIndices;
    int[] csrRowLengths;
    CudaHelpers.TransformToCSRFormat(out csrValues, out csrColIndices, out csrRowLengths, problemElements);

    selfLinDot = linKernel.DiagonalDotCache;

    #region cuda initialization

    InitCudaModule();

    // Upload the CSR matrix and the diagonal dot-product cache to the device.
    valsPtr = cuda.CopyHostToDevice(csrValues);
    idxPtr = cuda.CopyHostToDevice(csrColIndices);
    vecLengthPtr = cuda.CopyHostToDevice(csrRowLengths);
    selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

    // Allocate page-locked, device-mapped host memory for the results so the
    // kernel can write straight into host-visible memory (zero-copy).
    uint resultBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(resultBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);

    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense and sized to the full problem dimension; most
    // entries will be zero, but the dense layout is faster for the CUDA kernel.
    mainVector = new float[problemElements[0].Dim + 1];
    CudaHelpers.FillDenseVector(problemElements[0], mainVector);

    // Bind the main vector and the labels to their CUDA texture references.
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
}
/// <summary>
/// Predicts the specified elements on the GPU: computes the decision value for
/// each element against all support vectors of <c>TrainedModel</c>, then applies
/// the sign function (with bias <c>rho</c>) on the device.
/// </summary>
/// <param name="elements">The elements to classify, as sparse vectors.</param>
/// <returns>array of predicted labels +1 or -1</returns>
public override float[] Predict(SparseVec[] elements)
{
    if (!IsInitialized)
    {
        throw new ApplicationException("Evaluator is not initialized. Call init method");
    }

    // Transform elements to a matrix in CSR format.
    // elements values
    float[] vecVals;
    // elements indexes
    int[] vecIdx;
    // elements lengths (row sizes)
    int[] vecLenght;
    CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, elements);

    // Copy the CSR data to the device.
    valsPtr = cuda.CopyHostToDevice(vecVals);
    idxPtr = cuda.CopyHostToDevice(vecIdx);
    vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);

    // Release the host-side arrays; only the device copies are needed from here on.
    vecVals = null;
    vecIdx = null;
    vecLenght = null;

    uint memElementsSize = (uint)(elements.Length * sizeof(float));
    // Allocate page-locked, device-mapped host memory for the results: the
    // kernels write into it and the host reads it back after the final sync.
    outputIntPtr = cuda.HostAllocate(memElementsSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);
    // normal memory allocation alternative:
    //outputPtr = cuda.Allocate(memElementsSize);

    // Set the cuda kernel parameters manually (driver API: each argument is
    // written at an explicit byte offset, so the order below must match the
    // kernel's parameter list exactly).
    #region set cuda parameters

    uint Rows = (uint)elements.Length;
    uint Cols = (uint)TrainedModel.SupportElements.Length;

    cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

    int offset = 0;
    // set elements (CSR matrix) params
    cuda.SetParameter(cuFunc, offset, valsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, idxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer);
    offset += IntPtr.Size;

    // set labels param
    cuda.SetParameter(cuFunc, offset, labelsPtr.Pointer);
    offset += IntPtr.Size;

    // set alphas param
    cuda.SetParameter(cuFunc, offset, alphasPtr.Pointer);
    offset += IntPtr.Size;

    // set output (result) param
    cuda.SetParameter(cuFunc, offset, outputPtr.Pointer);
    offset += IntPtr.Size;

    // set number of elements param
    cuda.SetParameter(cuFunc, offset, (uint)Rows);
    offset += sizeof(int);

    // set number of support vectors param
    cuda.SetParameter(cuFunc, offset, (uint)Cols);
    offset += sizeof(int);

    // set support vector index param; its offset is remembered so the loop
    // below can overwrite just this one argument per iteration.
    lastParameterOffset = offset;
    cuda.SetParameter(cuFunc, offset, (uint)0);
    offset += sizeof(int);

    cuda.SetParameterSize(cuFunc, (uint)offset);

    #endregion

    int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));

    // Launch the kernel once per support vector, double-buffering the dense
    // support-vector data through two pinned host buffers (svVecIntPtrs[0/1]).
    // NOTE(review): InitBuffer writes into svVecIntPtrs[k % 2] *before*
    // SynchronizeStream is called — this relies on the async copy that last
    // read that buffer (iteration k-2) having already completed; verify the
    // ordering is safe.
    for (int k = 0; k < TrainedModel.SupportElements.Length; k++)
    {
        // set the buffer values from the k-th support vector
        CudaHelpers.InitBuffer(TrainedModel.SupportElements[k], svVecIntPtrs[k % 2]);

        cuda.SynchronizeStream(stream);

        // copy asynchronously from the host buffer to the device
        cuda.CopyHostToDeviceAsync(mainVecPtr, svVecIntPtrs[k % 2], memSvSize, stream);

        // set the last kernel parameter (column index of the support vector)
        cuda.SetParameter(cuFunc, lastParameterOffset, (uint)k);

        // launch the kernel on the stream
        cuda.LaunchAsync(cuFunc, gridDimX, 1, stream);

        if (k > 0)
        {
            // zero out the previous host buffer (the one used by iteration k-1;
            // (k + 1) % 2 == (k - 1) % 2) so it can be refilled next iteration
            CudaHelpers.SetBufferIdx(TrainedModel.SupportElements[k - 1], svVecIntPtrs[(k + 1) % 2], 0.0f);
        }
    }

    // Alternative: write the bias into a device symbol instead of passing it
    // as a kernel parameter.
    //CUdeviceptr symbolAdr;
    //CUDARuntime.cudaGetSymbolAddress(ref symbolAdr,"RHO");
    rho = TrainedModel.Bias;
    //IntPtr symbolVal = new IntPtr(&rho);
    //CUDARuntime.cudaMemcpyToSymbol("RHO", symbolVal, 1, 1, cudaMemcpyKind.cudaMemcpyHostToDevice);

    // Second kernel: apply sign(output - rho) in place on the result array.
    cuda.SetFunctionBlockShape(cuFuncSign, blockSizeX, blockSizeY, 1);

    int signFuncOffset = 0;
    // set array param
    cuda.SetParameter(cuFuncSign, signFuncOffset, outputPtr.Pointer);
    signFuncOffset += IntPtr.Size;
    // set size param
    cuda.SetParameter(cuFuncSign, signFuncOffset, Rows);
    signFuncOffset += sizeof(int);
    // set rho (bias) param
    cuda.SetParameter(cuFuncSign, signFuncOffset, rho);
    signFuncOffset += sizeof(float);
    cuda.SetParameterSize(cuFuncSign, (uint)signFuncOffset);

    // gridDimX is valid for this function as well (same number of rows)
    cuda.LaunchAsync(cuFuncSign, gridDimX, 1, stream);

    // wait for all queued copies and kernels to finish
    cuda.SynchronizeContext();

    float[] result = new float[elements.Length];
    // copy the result out of the mapped host memory
    Marshal.Copy(outputIntPtr, result, 0, elements.Length);

    return (result);
}