/// <summary>
/// Runs the base initialization, converts the problem elements with
/// <c>CudaHelpers.TransformToERTILPFormat</c>, uploads the converted arrays and the
/// per-element value sums to the device, allocates the host output buffer with the
/// <c>CU_MEMHOSTALLOC_DEVICEMAP</c> flag and binds the main-vector and label textures.
/// When <c>MakeDenseVectorOnGPU</c> is set, an <c>EllpackDenseVectorBuilder</c> is created
/// and initialized as well.
/// </summary>
public override void Init()
{
    base.Init();

    // Many threads are launched per row, so the grid size is recomputed here.
    double totalThreads = ThreadsPerRow * problemElements.Length + 0.0;
    blocksPerGrid = (int)Math.Ceiling(totalThreads / threadsPerBlock);

    int alignment = ThreadsPerRow * Prefetch;

    float[] values;
    int[] columnIndices;
    int[] lengths;
    CudaHelpers.TransformToERTILPFormat(
        out values,
        out columnIndices,
        out lengths,
        problemElements,
        alignment,
        ThreadsPerRow);

    // One entry per problem element: the sum of that element's Values.
    selfSum = problemElements
        .AsParallel()
        .Select(element => element.Values.Sum())
        .ToArray();

    #region cuda initialization
    InitCudaModule();

    // Upload the converted data; the resulting pointers are consumed when the
    // kernel parameters are set below.
    valsPtr = cuda.CopyHostToDevice(values);
    idxPtr = cuda.CopyHostToDevice(columnIndices);
    vecLengthPtr = cuda.CopyHostToDevice(lengths);
    selfSumPtr = cuda.CopyHostToDevice(selfSum);

    // One float of output per problem element, in host memory allocated with the
    // DEVICEMAP flag. (The original notes a plain cuda.Allocate as the unused alternative.)
    uint outputBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);
    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense (Dim + 1 entries), so many entries stay zero;
    // per the original note this makes the CUDA computation faster.
    var firstElement = problemElements[0];
    mainVector = new float[firstElement.Dim + 1];
    CudaHelpers.FillDenseVector(firstElement, mainVector);

    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

    if (!MakeDenseVectorOnGPU)
    {
        return;
    }

    vecBuilder = new EllpackDenseVectorBuilder(
        cuda,
        mainVecPtr,
        valsPtr,
        idxPtr,
        vecLengthPtr,
        problemElements.Length,
        firstElement.Dim);
    vecBuilder.Init();
}
/// <summary>
/// Runs the base initialization, derives the launch geometry from
/// <c>threadsPerRow</c> and <c>sliceSize</c>, converts the problem elements with
/// <c>CudaHelpers.TransformToSERTILP</c>, uploads the converted arrays, labels and
/// per-element value sums to the device, allocates the host output buffer with the
/// <c>CU_MEMHOSTALLOC_DEVICEMAP</c> flag and binds the main-vector texture.
/// </summary>
public override void Init()
{
    base.Init();

    int elementCount = problemElements.Length;

    // One block covers sliceSize rows with threadsPerRow threads each.
    blockSize = threadsPerRow * sliceSize;
    double requestedThreads = 1.0 * elementCount * threadsPerRow;
    blocksPerGrid = (int)Math.Ceiling(requestedThreads / blockSize);

    // Round sliceSize * threadsPerRow up to the next multiple of 64.
    double threadsPerSlice = 1.0 * sliceSize * threadsPerRow;
    align = (int)Math.Ceiling(threadsPerSlice / 64) * 64;

    float[] values;
    int[] columnIndices;
    int[] lengths;
    int[] sliceOffsets;
    CudaHelpers.TransformToSERTILP(
        out values,
        out columnIndices,
        out sliceOffsets,
        out lengths,
        problemElements,
        threadsPerRow,
        sliceSize,
        preFechSize);

    // One entry per problem element: the sum of that element's Values.
    selfSum = problemElements
        .AsParallel()
        .Select(element => element.Values.Sum())
        .ToArray();

    #region cuda initialization
    InitCudaModule();

    // Upload the converted data; the resulting pointers are consumed when the
    // kernel parameters are set below.
    valsPtr = cuda.CopyHostToDevice(values);
    idxPtr = cuda.CopyHostToDevice(columnIndices);
    vecLengthPtr = cuda.CopyHostToDevice(lengths);
    sliceStartPtr = cuda.CopyHostToDevice(sliceOffsets);
    // Labels are copied as a regular device array here; the label texture
    // binding is disabled in this variant.
    labelsPtr = cuda.CopyHostToDevice(Y);
    selfSumPtr = cuda.CopyHostToDevice(selfSum);

    // One float of output per problem element, in host memory allocated with the
    // DEVICEMAP flag. (The original notes a plain cuda.Allocate as the unused alternative.)
    uint outputBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);
    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense (Dim + 1 entries), so many entries stay zero;
    // per the original note this makes the CUDA computation faster.
    var firstElement = problemElements[0];
    mainVector = new float[firstElement.Dim + 1];
    CudaHelpers.FillDenseVector(firstElement, mainVector);

    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
}
/// <summary>
/// Runs the base initialization, converts the problem elements with
/// <c>CudaHelpers.TransformToCSRFormat</c>, uploads the converted arrays to the device,
/// allocates the host output buffer with the <c>CU_MEMHOSTALLOC_DEVICEMAP</c> flag and
/// binds the main-vector and label textures.
/// </summary>
/// <remarks>
/// The linear kernel is deliberately not initialized here: per the original note it is
/// only used for computing the element product.
/// </remarks>
public override void Init()
{
    base.Init();

    float[] values;
    int[] indices;
    int[] lengths;
    CudaHelpers.TransformToCSRFormat(out values, out indices, out lengths, problemElements);

    #region cuda initialization
    InitCudaModule();

    // Upload the converted data; the resulting pointers are consumed when the
    // kernel parameters are set below.
    valsPtr = cuda.CopyHostToDevice(values);
    idxPtr = cuda.CopyHostToDevice(indices);
    vecLengthPtr = cuda.CopyHostToDevice(lengths);

    // Mapped memory for the results: one float per problem element.
    uint outputBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);
    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense (Dim + 1 entries), so many entries stay zero;
    // per the original note this makes the CUDA computation faster.
    var firstElement = problemElements[0];
    mainVector = new float[firstElement.Dim + 1];
    CudaHelpers.FillDenseVector(firstElement, mainVector);

    // Texture setup for the main vector and the labels goes through the shared helper.
    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
}
/// <summary>
/// Initializes the wrapped linear kernel with this kernel's problem elements and labels,
/// runs the base initialization, converts the problem elements with
/// <c>CudaHelpers.TransformToCSRFormat</c>, uploads the converted arrays and the linear
/// kernel's <c>DiagonalDotCache</c> to the device, allocates the host output buffer with
/// the <c>CU_MEMHOSTALLOC_DEVICEMAP</c> flag and binds the main-vector and label textures.
/// </summary>
public override void Init()
{
    // The linear kernel must be initialized first: its DiagonalDotCache is read below.
    linKernel.ProblemElements = problemElements;
    linKernel.Y = Y;
    linKernel.Init();

    base.Init();

    float[] values;
    int[] indices;
    int[] lengths;
    CudaHelpers.TransformToCSRFormat(out values, out indices, out lengths, problemElements);

    selfLinDot = linKernel.DiagonalDotCache;

    #region cuda initialization
    InitCudaModule();

    // Upload the converted data; the resulting pointers are consumed when the
    // kernel parameters are set below.
    valsPtr = cuda.CopyHostToDevice(values);
    idxPtr = cuda.CopyHostToDevice(indices);
    vecLengthPtr = cuda.CopyHostToDevice(lengths);
    // NOTE(review): the original flagged this copy with "!!!!!"; the reason is not
    // visible from this method.
    selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

    // Mapped memory for the results: one float per problem element.
    // (The original notes a plain cuda.Allocate as the unused alternative.)
    uint outputBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);
    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense (Dim + 1 entries), so many entries stay zero;
    // per the original note this makes the CUDA computation faster.
    var firstElement = problemElements[0];
    mainVector = new float[firstElement.Dim + 1];
    CudaHelpers.FillDenseVector(firstElement, mainVector);

    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);
}
/// <summary>
/// Runs the base initialization, converts the problem elements with
/// <c>CudaHelpers.TransformToEllpackRFormat</c>, uploads the converted arrays and the
/// per-element value sums to the device, allocates the host output buffer with the
/// <c>CU_MEMHOSTALLOC_DEVICEMAP</c> flag and binds the main-vector and label textures.
/// When <c>MakeDenseVectorOnGPU</c> is set, an <c>EllpackDenseVectorBuilder</c> is created
/// and initialized as well.
/// </summary>
public override void Init()
{
    base.Init();

    float[] values;
    int[] columnIndices;
    int[] lengths;
    CudaHelpers.TransformToEllpackRFormat(
        out values, out columnIndices, out lengths, problemElements);

    // One entry per problem element: the sum of that element's Values.
    selfSum = problemElements
        .AsParallel()
        .Select(element => element.Values.Sum())
        .ToArray();

    #region cuda initialization
    InitCudaModule();

    // Upload the converted data; the resulting pointers are consumed when the
    // kernel parameters are set below.
    valsPtr = cuda.CopyHostToDevice(values);
    idxPtr = cuda.CopyHostToDevice(columnIndices);
    vecLengthPtr = cuda.CopyHostToDevice(lengths);
    selfSumPtr = cuda.CopyHostToDevice(selfSum);

    // Mapped memory for the results: one float per problem element.
    uint outputBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);
    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense (Dim + 1 entries), so many entries stay zero;
    // per the original note this makes the CUDA computation faster.
    var firstElement = problemElements[0];
    mainVector = new float[firstElement.Dim + 1];
    CudaHelpers.FillDenseVector(firstElement, mainVector);

    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

    if (MakeDenseVectorOnGPU)
    {
        int elementCount = problemElements.Length;
        vecBuilder = new EllpackDenseVectorBuilder(
            cuda, mainVecPtr, valsPtr, idxPtr, vecLengthPtr, elementCount, firstElement.Dim);
        vecBuilder.Init();
    }
}
/// <summary>
/// Initializes the wrapped linear kernel with this kernel's problem elements and labels,
/// runs the base initialization, converts the problem elements with
/// <c>CudaHelpers.TransformToEllpackRFormat</c>, uploads the converted arrays and the
/// linear kernel's <c>DiagonalDotCache</c> to the device, allocates the host output buffer
/// with the <c>CU_MEMHOSTALLOC_DEVICEMAP</c> flag and binds the main-vector and label
/// textures. When <c>MakeDenseVectorOnGPU</c> is set, an <c>EllpackDenseVectorBuilder</c>
/// is created and initialized as well.
/// </summary>
public override void Init()
{
    // The linear kernel must be initialized first: its DiagonalDotCache is read below.
    linKernel.ProblemElements = problemElements;
    linKernel.Y = Y;
    linKernel.Init();

    base.Init();

    float[] values;
    int[] columnIndices;
    int[] lengths;
    CudaHelpers.TransformToEllpackRFormat(
        out values, out columnIndices, out lengths, problemElements);

    selfLinDot = linKernel.DiagonalDotCache;

    #region cuda initialization
    InitCudaModule();

    // Upload the converted data; the resulting pointers are consumed when the
    // kernel parameters are set below.
    valsPtr = cuda.CopyHostToDevice(values);
    idxPtr = cuda.CopyHostToDevice(columnIndices);
    vecLengthPtr = cuda.CopyHostToDevice(lengths);
    selfLinDotPtr = cuda.CopyHostToDevice(selfLinDot);

    // Mapped memory for the results: one float per problem element.
    // (The original notes a plain cuda.Allocate as the unused alternative.)
    uint outputBytes = (uint)(problemElements.Length * sizeof(float));
    outputIntPtr = cuda.HostAllocate(outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuda.GetHostDevicePointer(outputIntPtr, 0);
    #endregion

    SetCudaFunctionParameters();

    // The main vector is dense (Dim + 1 entries), so many entries stay zero;
    // per the original note this makes the CUDA computation faster.
    var firstElement = problemElements[0];
    mainVector = new float[firstElement.Dim + 1];
    CudaHelpers.FillDenseVector(firstElement, mainVector);

    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr);
    CudaHelpers.SetTextureMemory(
        cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr);

    if (!MakeDenseVectorOnGPU)
    {
        return;
    }

    vecBuilder = new EllpackDenseVectorBuilder(
        cuda,
        mainVecPtr,
        valsPtr,
        idxPtr,
        vecLengthPtr,
        problemElements.Length,
        firstElement.Dim);
    vecBuilder.Init();
}
/// <summary>
/// Initializes the wrapped linear kernel with this kernel's problem elements and labels,
/// runs the base initialization, derives the launch geometry from <c>threadsPerRow</c>
/// and <c>sliceSize</c>, converts the problem elements with
/// <c>CudaHelpers.TransformToSlicedEllpack</c>, uploads the converted arrays, the linear
/// kernel's <c>DiagonalDotCache</c> and the labels through <c>gpu</c>, allocates the host
/// output buffer through <c>cuGPU</c> with the <c>CU_MEMHOSTALLOC_DEVICEMAP</c> flag and
/// binds the main-vector texture.
/// </summary>
public override void Init()
{
    // The linear kernel must be initialized first: its DiagonalDotCache is read below.
    linKernel.ProblemElements = problemElements;
    linKernel.Y = Y;
    linKernel.Init();

    base.Init();

    int elementCount = problemElements.Length;

    // One block covers sliceSize rows with threadsPerRow threads each.
    blockSize = threadsPerRow * sliceSize;
    double requestedThreads = 1.0 * elementCount * threadsPerRow;
    blockPerGrid = (int)Math.Ceiling(requestedThreads / blockSize);

    // Round sliceSize * threadsPerRow up to the next multiple of 64.
    double threadsPerSlice = 1.0 * sliceSize * threadsPerRow;
    align = (int)Math.Ceiling(threadsPerSlice / 64) * 64;

    float[] values;
    int[] columnIndices;
    int[] lengths;
    int[] sliceOffsets;
    CudaHelpers.TransformToSlicedEllpack(
        out values,
        out columnIndices,
        out sliceOffsets,
        out lengths,
        problemElements,
        threadsPerRow,
        sliceSize);

    selfLinDot = linKernel.DiagonalDotCache;

    #region cudafy initialization
    InitCudaModule();

    // Upload the converted data through the gpu object.
    valsPtr = gpu.CopyToDevice(values);
    idxPtr = gpu.CopyToDevice(columnIndices);
    vecLenghtPtr = gpu.CopyToDevice(lengths);
    sliceStartPtr = gpu.CopyToDevice(sliceOffsets);
    // NOTE(review): the original flagged this copy with "!!!!!"; the reason is not
    // visible from this method.
    selfLinDotPtr = gpu.CopyToDevice(selfLinDot);
    // Labels are copied as a regular device array here; the label texture
    // binding is disabled in this variant.
    labelsPtr = gpu.CopyToDevice(Y);

    // Mapped memory for the results: one float per problem element, allocated
    // through cuGPU rather than gpu.
    int outputBytes = (problemElements.Length * sizeof(float));
    outputIntPtr = cuGPU.HostAllocate((uint)outputBytes, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    outputPtr = cuGPU.GetHostDevicePointer(outputIntPtr, 0);
    #endregion

    // The main vector is dense (Dim + 1 entries), so many entries stay zero;
    // per the original note this makes the CUDA computation faster.
    var firstElement = problemElements[0];
    mainVector = new float[firstElement.Dim + 1];
    CudaHelpers.FillDenseVector(firstElement, mainVector);

    CudaHelpers.SetTextureMemory(
        cuGPU, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVectorPtr);
}