/// <summary>
/// Uploads the solver's host arrays to the CUDA device, creates the host/device
/// reduction buffers, writes the module global "C" and finally calls
/// <c>SetCudaParams</c>.
/// </summary>
private void SetCudaData()
{
    // Fills reductionThreads and reductionBlocks through the ref arguments.
    CudaHelpers.GetNumThreadsAndBlocks(
        problemSize,
        maxReductionBlocks,
        threadsPerBlock,
        ref reductionThreads,
        ref reductionBlocks);

    // Device copies of the solver state.
    alphaPtr = cuda.CopyHostToDevice(alpha);
    gradPtr = cuda.CopyHostToDevice(G);
    yPtr = cuda.CopyHostToDevice(y);
    kernelDiagPtr = cuda.CopyHostToDevice(QD);

    // Buffers for kernel columns i and j are created by uploading alpha
    // (the original note says this is simply the easiest way to get an
    // array of zeros - presumably alpha is all zeros here; verify against caller).
    kiPtr = cuda.CopyHostToDevice(alpha);
    kjPtr = cuda.CopyHostToDevice(alpha);

    // todo: remove it
    // Reduction buffers hold one entry per reduction thread
    // (the original note names reductionBlocks as the alternative size).
    reduceVal = new float[reductionThreads];
    reduceIdx = new int[reductionThreads];
    valRedPtr = cuda.CopyHostToDevice(reduceVal);
    idxRedPtr = cuda.CopyHostToDevice(reduceIdx);

    // Write the value of C into the module-level symbol "C".
    constCPtr = cuda.GetModuleGlobal(cuModule, "C");
    cuda.CopyHostToDevice(constCPtr, new float[] { C });

    SetCudaParams();
}
/// <summary>
/// Uploads the solver's host arrays to the CUDA device, allocates one shared
/// device buffer for the two kernel columns, creates the reduction buffers,
/// writes the module globals "C", "B" and "A" and finally calls
/// <c>SetCudaParams</c>.
/// </summary>
private void SetCudaData()
{
    // Fills reductionThreads and reductionBlocks through the ref arguments.
    CudaHelpers.GetNumThreadsAndBlocks(
        problemSize,
        maxReductionBlocks,
        threadsPerBlock,
        ref reductionThreads,
        ref reductionBlocks);

    // Device copies of the solver state.
    alphaPtr = cuda.CopyHostToDevice(alpha);
    gradPtr = cuda.CopyHostToDevice(G);
    yPtr = cuda.CopyHostToDevice(y);

    // Kernel columns i and j live in a single allocation of 2 * problemSize
    // floats; kj starts problemSize floats after ki. No host data is passed
    // to this call, so nothing is uploaded into the buffer here.
    uint kernelColumnsBytes = (uint)(sizeof(float) * problemSize * 2);
    kiPtr = cuda.Allocate(kernelColumnsBytes);
    kjPtr = kiPtr + sizeof(float) * problemSize;

    // todo: remove it
    // Reduction buffers hold two entries per reduction thread
    // (the original note names reductionBlocks as the alternative base size).
    reduceVal = new float[reductionThreads * 2];
    reduceIdx = new int[reductionThreads * 2];
    valRedPtr = cuda.CopyHostToDevice(reduceVal);
    idxRedPtr = cuda.CopyHostToDevice(reduceIdx);

    // Module global "C": single float holding C.
    constCPtr = cuda.GetModuleGlobal(cuModule, "C");
    cuda.CopyHostToDevice(constCPtr, new float[] { C });

    // Module global "B": three floats { 0, 0, C }.
    constBPtr = cuda.GetModuleGlobal(cuModule, "B");
    B = new float[] { 0, 0, C };
    cuda.CopyHostToDevice(constBPtr, B);

    // Module global "A": three floats { -C, 0, 0 }.
    constAPtr = cuda.GetModuleGlobal(cuModule, "A");
    A = new float[] { -C, 0, 0 };
    cuda.CopyHostToDevice(constAPtr, A);

    SetCudaParams();
}
/// <summary>
/// Transfers the training vectors of <paramref name="sub_prob"/> to the CUDA
/// device (once in CSR and once in CSC layout), allocates the device buffers
/// and host-side partial-reduction arrays used by the solver, binds the
/// texture references, seeds the "stepBB" module global and finally calls
/// <c>SetCudaParameters</c>.
/// </summary>
/// <param name="sub_prob">
/// Problem whose elements and labels are sent to the device. All buffer and
/// reduction sizes in this method are derived from this argument.
/// </param>
private void SetCudaData(Problem<SparseVec> sub_prob)
{
    int vecDim = sub_prob.FeaturesCount;
    int elementCount = sub_prob.ElementsCount;

    /*
     * copy vectors to CUDA device
     */
    #region copy training examples to GPU
    float[] vecVals;
    int[] vecIdx;
    int[] vecLengths;

    CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLengths, sub_prob.Elements);
    valsCSRPtr = cuda.CopyHostToDevice(vecVals);
    idxCSRPtr = cuda.CopyHostToDevice(vecIdx);
    vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLengths);

    CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLengths, sub_prob.Elements);
    valsCSCPtr = cuda.CopyHostToDevice(vecVals);
    idxCSCPtr = cuda.CopyHostToDevice(vecIdx);
    vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLengths);
    #endregion

    /*
     * allocate device memory for gradient and alpha vectors
     * (one float per problem element each)
     */
    alphaMemSize = (uint)(elementCount * sizeof(float));
    gradPtr = cuda.Allocate(alphaMemSize);
    gradOldPtr = cuda.Allocate(alphaMemSize);
    alphaPtr = cuda.Allocate(alphaMemSize);
    alphaOldPtr = cuda.Allocate(alphaMemSize);
    alphaTmpPtr = cuda.Allocate(alphaMemSize);

    /*
     * reduction blocks for computing Obj (sized by the vector dimension)
     */
    GetNumThreadsAndBlocks(vecDim, 64, threadsPerBlock, ref threadsForReduceObjW, ref bpgReduceW);
    reduceObjW = new float[bpgReduceW];
    uint reduceWBytes = (uint)bpgReduceW * sizeof(float);
    reduceObjWPtr = cuda.Allocate(reduceWBytes);

    /*
     * reduction size for kernels which operate on alpha.
     * Derived from sub_prob so that the reduction layout always matches the
     * alpha/gradient buffers allocated above from the same element count.
     */
    threadsForReduceObjAlpha = 0;
    GetNumThreadsAndBlocks(elementCount, 64, threadsPerBlock, ref threadsForReduceObjAlpha, ref bpgReduceAlpha);
    uint alphaReductionBytes = (uint)bpgReduceAlpha * sizeof(float);

    /*
     * reduction array for computing objective function value
     */
    reduceObjAlpha = new float[bpgReduceAlpha];
    reduceObjAlphaPtr = cuda.Allocate(alphaReductionBytes);

    /*
     * reduction arrays for computing BB step
     */
    alphaPartReduce = new float[bpgReduceAlpha];
    gradPartReduce = new float[bpgReduceAlpha];
    alphaGradPartReduce = new float[bpgReduceAlpha];
    reduceBBAlphaGradPtr = cuda.Allocate(alphaReductionBytes);
    reduceBBAlphaPtr = cuda.Allocate(alphaReductionBytes);
    reduceBBGradPtr = cuda.Allocate(alphaReductionBytes);

    /*
     * reduction arrays for computing lin part
     */
    reduceLinPart = new float[bpgReduceAlpha];
    reduceLinPartPtr = cuda.Allocate(alphaReductionBytes);

    // temporary W vector on the device: one float per feature
    wVecMemSize = (uint)vecDim * sizeof(float);
    wTempVecPtr = cuda.Allocate(wVecMemSize);

    // texture for the W vector (size only, no host data passed)
    SetTextureMemory(ref cuWVecTexRef, cudaWVecTexRefName, ref wVecPtr, wVecMemSize);

    // texture for the labels, filled from sub_prob.Y
    SetTextureMemory(ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);

    // texture for the deltas (size only, no host data passed)
    SetTextureMemory(ref cuDeltasTexRef, "deltasTexRef", ref deltasPtr, alphaMemSize);

    // only the address of "diag_shift" is looked up here; nothing is written to it
    diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");

    // seed the "stepBB" module global with 0.1
    stepBBPtr = cuda.GetModuleGlobal(cuModule, "stepBB");
    float[] stepData = new float[] { 0.1f };
    cuda.CopyHostToDevice(stepBBPtr, stepData);

    SetCudaParameters(sub_prob);
}
/// <summary>
/// Transfers the training vectors of <paramref name="sub_prob"/> to the CUDA
/// device (once in CSR and once in CSC layout), sets up the mapped gradient
/// buffer, the texture-backed vectors and the per-element solver arrays,
/// writes the "stepScaling" module global and finally calls
/// <c>SetCudaParameters</c>.
/// </summary>
/// <param name="sub_prob">Problem whose elements and labels are sent to the device.</param>
private void SetCudaData(Problem<SparseVec> sub_prob)
{
    // The dimension is read from the first element.
    int vecDim = sub_prob.Elements[0].Dim;

    /*
     * copy vectors to CUDA device
     */
    float[] csVals;
    int[] csIdx;
    int[] csLen;

    CudaHelpers.TransformToCSRFormat(out csVals, out csIdx, out csLen, sub_prob.Elements);
    valsCSRPtr = cuda.CopyHostToDevice(csVals);
    idxCSRPtr = cuda.CopyHostToDevice(csIdx);
    vecLenghtCSRPtr = cuda.CopyHostToDevice(csLen);

    CudaHelpers.TransformToCSCFormat(out csVals, out csIdx, out csLen, sub_prob.Elements);
    valsCSCPtr = cuda.CopyHostToDevice(csVals);
    idxCSCPtr = cuda.CopyHostToDevice(csIdx);
    vecLenghtCSCPtr = cuda.CopyHostToDevice(csLen);

    /*
     * gradient buffer: host memory allocated with the DEVICEMAP flag, plus
     * the matching device pointer (original note: holds the dot products
     * between vector W and all elements)
     */
    uint gradBytes = (uint)(sub_prob.ElementsCount * sizeof(float));
    gradIntPtr = cuda.HostAllocate(gradBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    gradPtr = cuda.GetHostDevicePointer(gradIntPtr, 0);

    // Main vector has one slot per dimension; the original note accepts that
    // many entries will be zero because the CUDA computation is faster this way.
    mainVector = new float[vecDim];
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);

    // texture for the labels, filled from sub_prob.Y
    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr);

    /*
     * data for cuda solver
     *
     * Original note: normally for the L2 solver QDii = xii*xii + Diag_i, where
     * Diag_i = 0.5/Cp if yi = 1 and 0.5/Cn if yi = -1; that shift is added on
     * the GPU, so only the plain dot product is stored here.
     */
    QD = new float[sub_prob.ElementsCount];
    // alpha and deltas start at zero: freshly allocated float arrays are zero-filled.
    alpha = new float[sub_prob.ElementsCount];
    deltas = new float[sub_prob.ElementsCount];

    for (int k = 0; k < QD.Length; k++)
    {
        QD[k] = sub_prob.Elements[k].DotProduct();
    }

    qdPtr = cuda.CopyHostToDevice(QD);
    // NOTE(review): Allocate (not CopyHostToDevice) is used for alpha, so the
    // host zeros are presumably not uploaded here - confirm the device buffer
    // is initialised elsewhere before it is read.
    alphaPtr = cuda.Allocate(alpha);

    CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuDeltasTexRef, "deltasTexRef", deltas, ref deltasPtr);

    // Only the address of "diag_shift" is looked up here; per the original
    // note its contents are set in the fill function.
    diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift");

    CUdeviceptr stepScalingPtr = cuda.GetModuleGlobal(cuModule, "stepScaling");

    // Fixed scaling constant. The original notes list 1/sqrt(ElementsCount)
    // and 1/ElementsCount as alternative ways of computing it.
    stepScaling = 0.0002f;
    cuda.CopyHostToDevice(stepScalingPtr, new float[] { stepScaling });

    SetCudaParameters(sub_prob);
}