public override void Init() { base.Init(); blockSize = threadsPerRow * sliceSize; int N = problemElements.Length; blocksPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize); align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64; float[] vecVals; int[] vecColIdx; int[] vecLenght; int[] sliceStart; CudaHelpers.TransformToSERTILP(out vecVals, out vecColIdx, out sliceStart, out vecLenght, problemElements, threadsPerRow, sliceSize,preFechSize); selfSum = problemElements.AsParallel().Select(x => x.Values.Sum()).ToArray(); #region cuda initialization InitCudaModule(); //copy data to device, set cuda function parameters valsPtr = cuda.CopyHostToDevice(vecVals); idxPtr = cuda.CopyHostToDevice(vecColIdx); vecLengthPtr = cuda.CopyHostToDevice(vecLenght); sliceStartPtr = cuda.CopyHostToDevice(sliceStart); labelsPtr = cuda.CopyHostToDevice(Y); selfSumPtr = cuda.CopyHostToDevice(selfSum); uint memSize = (uint)(problemElements.Length * sizeof(float)); outputIntPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0); //normal memory allocation //outputPtr = cuda.Allocate((uint)(sizeof(float) * problemElements.Length)); #endregion SetCudaFunctionParameters(); //allocate memory for main vector, size of this vector is the same as dimension, so many //indexes will be zero, but cuda computation is faster mainVector = new float[problemElements[0].Dim + 1]; CudaHelpers.FillDenseVector(problemElements[0], mainVector); CudaHelpers.SetTextureMemory(cuda,cuModule,ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr); // CudaHelpers.SetTextureMemory(cuda,cuModule,ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr); }