public static float[] CRSSparseMMwithDenseVector(int repetition, string moduleFunction, int blockSizeX, int blockSizeY) { CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //allocate memory for output IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); //create dense vector for each column in B matrix float[] mainVec = new float[maxIndex + 1]; uint memSize = (uint)((maxIndex + 1) * sizeof(float)); CUstream stream0 = cuda.CreateStream(); IntPtr[] mainVecIntPtrs = new IntPtr[2]; //write combined memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); // //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); mainVecIntPtrs[0] = cuda.AllocateHost(memSize); mainVecIntPtrs[1] = cuda.AllocateHost(memSize); CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0); //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); //mapped memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize); //get texture reference CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef"); cuda.SetTextureFlags(cuTexRef, 0); cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); int colIndexParamOffset = offset; cuda.SetParameter(cuFunc, offset, (uint)0); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); int gridDim = (Rows + blockSizeX - 1) / blockSizeX; Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int rep = 0; rep < repetition; rep++) { for (int k = 0; k < Cols; k++) { Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]); cuda.SynchronizeStream(stream0); cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0); cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k); cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f); //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr); ////make asynchronius copy and kernel lauch //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0); //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k); //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f); } } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Marshal.Copy(outputPtr2, output, 0, outputSize); Console.WriteLine("Matrix products with kernel {0}", moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); cuda.DestroyStream(stream0); cuda.Free(mainVecPtr); cuda.DestroyTexture(cuTexRef); return(output); }
/// <summary> /// implementation of sparese matrix product /// </summary> /// <param name="repetition">how many times kernel should be launch</param> /// <param name="moduleFunction">cuda kenrel name</param> /// <param name="blockSizeX">block size X</param> /// <param name="blockSizeY">block size Y</param> /// <param name="transposeGrid">indicate that grid dimensions should be /// computed alternativly, if false than gridDimY- connected with rows /// else gridDim.Y conected with cols</param> /// <returns></returns> public static float[] CRSSparseMM(int repetition, string moduleFunction, int blockSizeX, int blockSizeY, bool transposeGrid) { //int blockSizeX = 4; //int blockSizeY = 4; CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals); CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx); CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //CUdeviceptr dOutput = cuda.Allocate(output); IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; int Belements = BVals.Length; cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Aelements); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Belements); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef"); //cuda.SetTextureFlags(cuTexRef, 0); int gridDimX = (int)Math.Ceiling((Cols + 0.0) / (blockSizeX)); int gridDimY = (int)Math.Ceiling((0.0 + Rows) / blockSizeY); if (transposeGrid) { gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY); } Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int k = 0; k < repetition; k++) { cuda.Launch(cuFunc, gridDimX, gridDimY); cuda.SynchronizeContext(); // cuda.CopyDeviceToHost(dOutput, output); Marshal.Copy(outputPtr2, output, 0, outputSize); } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Console.WriteLine("Matrix products with kernel {0}", moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(BValsPtr); cuda.Free(BIdxPtr); cuda.Free(BLenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); return(output); }
public override void Init() { linKernel.ProblemElements = problemElements; linKernel.Y = Y; linKernel.Init(); base.Init(); blockSize = threadsPerRow * sliceSize; int N = problemElements.Length; blockPerGrid = (int)Math.Ceiling(1.0 * N * threadsPerRow / blockSize); align = (int)Math.Ceiling(1.0 * sliceSize * threadsPerRow / 64) * 64; float[] vecVals; int[] vecColIdx; int[] vecLenght; int[] sliceStart; CudaHelpers.TransformToSlicedEllpack(out vecVals, out vecColIdx, out sliceStart, out vecLenght, problemElements, threadsPerRow, sliceSize); selfLinDot = linKernel.DiagonalDotCache; #region cudafy initialization InitCudaModule(); //copy data to device, set cuda function parameters valsPtr = gpu.CopyToDevice(vecVals); idxPtr = gpu.CopyToDevice(vecColIdx); vecLenghtPtr = gpu.CopyToDevice(vecLenght); sliceStartPtr = gpu.CopyToDevice(sliceStart); //!!!!! selfLinDotPtr = gpu.CopyToDevice(selfLinDot); labelsPtr = gpu.CopyToDevice(Y); //gpu.CopyToConstantMemory(new float[] { Gamma }, GammaDev); //float[] GammaDev =new float[] { Gamma }; //float[] GammaDevPtr = gpu.Allocate<float>(1); //gpu.CopyToConstantMemory<float>(GammaDev,GammaDevPtr); //float[] Gammas = new float[] { Gamma }; //float[] GammaDev = gpu.Allocate<float>(1); //gpu.CopyToConstantMemory<float>(Gammas, GammaDev); int memSize = (problemElements.Length * sizeof(float)); //allocate mapped memory for our results //outputIntPtr = gpu.HostAllocate<float>(problemElements.Length); // .HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); //outputPtr = gpu.GetDeviceMemoryFromIntPtr(outputIntPtr);// cuda.GetHostDevicePointer(outputIntPtr, 0); outputIntPtr = cuGPU.HostAllocate((uint)memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); outputPtr = cuGPU.GetHostDevicePointer(outputIntPtr, 0); #endregion //allocate memory for main vector, size of this vector is the same as dimenson, so many //indexes will be zero, but cuda computation is faster mainVector = new float[problemElements[0].Dim + 1]; CudaHelpers.FillDenseVector(problemElements[0], mainVector); CudaHelpers.SetTextureMemory(cuGPU, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVectorPtr); //CudaHelpers.SetTextureMemory(cuGPU, ref cuLabelsTexRef, cudaLabelsTexRefName, Y, ref labelsPtr); }
private void SetCudaData(Problem <SparseVec> sub_prob) { int vecDim = sub_prob.Elements[0].Dim; /* * copy vectors to CUDA device */ float[] vecVals; int[] vecIdx; int[] vecLenght; CudaHelpers.TransformToCSRFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements); valsCSRPtr = cuda.CopyHostToDevice(vecVals); idxCSRPtr = cuda.CopyHostToDevice(vecIdx); vecLenghtCSRPtr = cuda.CopyHostToDevice(vecLenght); CudaHelpers.TransformToCSCFormat(out vecVals, out vecIdx, out vecLenght, sub_prob.Elements); valsCSCPtr = cuda.CopyHostToDevice(vecVals); idxCSCPtr = cuda.CopyHostToDevice(vecIdx); vecLenghtCSCPtr = cuda.CopyHostToDevice(vecLenght); /* * allocate memory for gradient */ uint memSize = (uint)(sub_prob.ElementsCount * sizeof(float)); //allocate mapped memory for our results (dot product beetween vector W and all elements) gradIntPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); gradPtr = cuda.GetHostDevicePointer(gradIntPtr, 0); //allocate memory for main vector, size of this vector is the same as dimenson, so many //indexes will be zero, but cuda computation is faster mainVector = new float[vecDim]; //move W wector //CudaHelpers.FillDenseVector(problemElements[0], mainVector); CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuMainVecTexRef, cudaMainVecTexRefName, mainVector, ref mainVecPtr); //set texture memory for labels CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuLabelsTexRef, cudaLabelsTexRefName, sub_prob.Y, ref labelsPtr); /* * data for cuda solver */ //normaly for L2 solver QDii= xii*xii+Diag_i //where Diag_i = 0.5/Cp if yi=1 // Diag_i = 0.5/Cn if yi=-1 //but we will add this on GPU QD = new float[sub_prob.ElementsCount]; alpha = new float[sub_prob.ElementsCount]; deltas = new float[sub_prob.ElementsCount]; float[] diag = new float[3]; for (int i = 0; i < sub_prob.ElementsCount; i++) { QD[i] = sub_prob.Elements[i].DotProduct(); alpha[i] = 0f; deltas[i] = 0; } qdPtr = cuda.CopyHostToDevice(QD); alphaPtr = cuda.Allocate(alpha); //deltasPtr = cuda.Allocate(deltas); CudaHelpers.SetTextureMemory(cuda, cuModule, ref cuDeltasTexRef, "deltasTexRef", deltas, ref deltasPtr); diagPtr = cuda.GetModuleGlobal(cuModule, "diag_shift"); //set this in fill function //cuda.CopyHostToDevice(diagPtr, diag); //CUdeviceptr dimPtr = cuda.GetModuleGlobal(cuModule, "Dim"); ////todo: check if it ok ////cuda.Memset(dimPtr,(uint) vecDim, 1); //int[] dimArr = new int[] { vecDim }; //cuda.CopyHostToDevice(dimPtr,dimArr); //CUDARuntime.cudaMemcpyToSymbol("Dim", dimPtr, 1, 0, cudaMemcpyKind.cudaMemcpyHostToDevice); //CUDARuntime.cudaMemcpyToSymbol("Dim", ,1,0, cudaMemcpyKind.cudaMemcpyHostToDevice); CUdeviceptr deltaScalingPtr = cuda.GetModuleGlobal(cuModule, "stepScaling"); //two ways of computing scaling param, should be the same, but it depends on rounding. //stepScaling = (float)(1.0 / Math.Sqrt(sub_prob.ElementsCount)); stepScaling = 0.0002f;// (float)(1.0 / sub_prob.ElementsCount); //set scaling constant float[] scArr = new float[] { stepScaling }; cuda.CopyHostToDevice(deltaScalingPtr, scArr); //cuda.Memset(deltaScalingPtr, (uint) scaling,sizeof(float)); //cuda.CopyHostToDevice(dimPtr, problem.Elements[0].Dim); SetCudaParameters(sub_prob); }
/// <summary> /// implementation of sparese matrix product /// </summary> /// <param name="repetition">how many times kernel should be launch</param> /// <param name="moduleFunction">cuda kenrel name</param> /// <param name="blockSizeX">block size X</param> /// <param name="blockSizeY">block size Y</param> /// <param name="transposeGrid">indicate that grid dimensions should be /// computed alternativly, if false than gridDimY- connected with rows /// else gridDim.Y conected with cols</param> /// <returns></returns> public static float[] CRSSparseMM(int repetition, string moduleFunction, int blockSizeX,int blockSizeY, bool transposeGrid) { //int blockSizeX = 4; //int blockSizeY = 4; CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen,out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen,out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals); CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx); CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //CUdeviceptr dOutput = cuda.Allocate(output); IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; int Belements = BVals.Length; cuda.SetFunctionBlockShape(cuFunc,blockSizeX,blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Aelements); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Belements); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef"); //cuda.SetTextureFlags(cuTexRef, 0); int gridDimX =(int) Math.Ceiling((Cols + 0.0) / (blockSizeX)); int gridDimY = (int)Math.Ceiling((0.0+Rows)/blockSizeY); if (transposeGrid) { gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY); } Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int k = 0; k < repetition; k++) { cuda.Launch(cuFunc, gridDimX, gridDimY); cuda.SynchronizeContext(); // cuda.CopyDeviceToHost(dOutput, output); Marshal.Copy(outputPtr2, output, 0, outputSize); } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Console.WriteLine("Matrix products with kernel {0}",moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(BValsPtr); cuda.Free(BIdxPtr); cuda.Free(BLenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); return output; }
public static float[] CRSSparseMMwithDenseVector(int repetition, string moduleFunction, int blockSizeX, int blockSizeY) { CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //allocate memory for output IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); //create dense vector for each column in B matrix float[] mainVec = new float[maxIndex + 1]; uint memSize = (uint)((maxIndex + 1) * sizeof(float)); CUstream stream0 =cuda.CreateStream(); IntPtr[] mainVecIntPtrs= new IntPtr[2]; //write combined memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); // //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); mainVecIntPtrs[0] = cuda.AllocateHost(memSize); mainVecIntPtrs[1] = cuda.AllocateHost(memSize); CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0); //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); //mapped memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize); //get texture reference CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef"); cuda.SetTextureFlags(cuTexRef, 0); cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); int colIndexParamOffset = offset; cuda.SetParameter(cuFunc, offset, (uint)0); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); int gridDim= (Rows + blockSizeX - 1) / blockSizeX; Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int rep = 0; rep < repetition; rep++) { for (int k = 0; k < Cols; k++) { Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]); cuda.SynchronizeStream(stream0); cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0); cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k); cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer Helpers.SetBufferIdx(BIdx, BRowLen, k-1, mainVecIntPtrs[(k+1) % 2], 0.0f); //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr); ////make asynchronius copy and kernel lauch //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0); //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k); //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f); } } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Marshal.Copy(outputPtr2, output, 0, outputSize); Console.WriteLine("Matrix products with kernel {0}", moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); cuda.DestroyStream(stream0); cuda.Free(mainVecPtr); cuda.DestroyTexture(cuTexRef); return output; }
//private static void InitMainVector(float[] vecVals, int[] vecIdx, int[] vecLenght, float[] mainVec) //{ // for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++) // { // int idx = vecIdx[j]; // float val = vecVals[j]; // mainVec[idx] = val; // } //} private static float[] CuDotProdCSRwriteCombined(int repetition) { //always the same values Random rnd = new Random(1); CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin")); CUfunction cuFunc = cuda.GetModuleFunction("spmv_csr_vector_kernel_wc"); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("init arrays"); Stopwatch t = Stopwatch.StartNew(); //temp lists for values, indices and vecotr lenght List<float> vecValsL = new List<float>(N * maxRowSize / 2); List<int> vecIdxL = new List<int>(N * maxRowSize / 2); List<int> vecLenghtL = new List<int>(N); float[] vecVals; int[] vecIdx; int[] vecLenght; maxIndex = 0; int vecStartIdx = 0; for (int i = 0; i < N; i++) { int vecSize = avgElements + i % stdElements; float[] vals = Helpers.InitValues(i, vecSize, maxVal); vecValsL.AddRange(vals); int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex); vecIdxL.AddRange(index); vecLenghtL.Add(vecStartIdx); vecStartIdx += vecSize; } //for last index vecLenghtL.Add(vecStartIdx); vecVals = vecValsL.ToArray(); vecIdx = vecIdxL.ToArray(); vecLenght = vecLenghtL.ToArray(); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals); CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx); CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght); float[] output = new float[N]; //CUdeviceptr dOutput = cuda.Allocate(output); IntPtr outputPtr2 = cuda.HostAllocate((uint)(N * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); uint memSize = (uint)((maxIndex + 1) * sizeof(float)); uint flags = CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP | CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED; uint tt = (uint)CUMemHostAllocFlags.WriteCombined; uint s = (uint)CUMemHostAllocFlags.DeviceMap; IntPtr mainVecIntPtr = cuda.HostAllocate(memSize, flags); CUdeviceptr mainVecPtr = cuda.GetHostDevicePointer(mainVecIntPtr, 0); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters cuda.SetFunctionBlockShape(cuFunc, threadsPerBlock, 1, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, valsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, idxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, mainVecPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)N); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)vecStartIdx); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); mainIndex = StartingIndex; Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int k = 0; k < repetition; k++) { //float[] tempFloatarr = new float[memSize]; Helpers.InitBuffer(vecVals, vecIdx, vecLenght,mainIndex, mainVecIntPtr); //Marshal.Copy(mainVecIntPtr, tempFloatarr, 0, tempFloatarr.Length); cuda.Launch(cuFunc, blocksPerGrid, 1); cuda.SynchronizeContext(); //cuda.CopyDeviceToHost(dOutput, output); Marshal.Copy(outputPtr2, output, 0, N); //mainVec = new float[maxIndex + 1]; //Array.Clear(mainVec, 0, mainVec.Length); //clear previous vector values Helpers.SetBufferIdx(vecIdx, vecLenght,mainIndex, mainVecIntPtr,0.0f); mainIndex++; } cuda.RecordEvent(end); cuda.SynchronizeContext(); //cuda.SynchronizeEvent(end); // cuda.CopyDeviceToHost(dOutput, output); timer.Stop(); float naiveTime = cuda.ElapsedTime(start, end); Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed); int lenght = Math.Min(displayCount, N); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(valsPtr); cuda.Free(idxPtr); cuda.Free(dOutput); cuda.Free(vecLenghtPtr); //cuda.DestroyArray(cuArr); cuda.Free(mainVecPtr); //cuda.DestroyTexture(cuTexRef); // cuda.Free(mainVecPtr); cuda.DestroyEvent(start); cuda.DestroyEvent(end); return output; }