/// <summary>
/// Releases everything this object holds through <c>cuda</c>: the device
/// buffers, the texture references, the loaded module and the CUDA object
/// itself. Returns immediately when <c>cuda</c> is null.
/// </summary>
private void DisposeCuda()
{
    // Nothing to release when there is no CUDA object.
    if (cuda == null)
    {
        return;
    }

    // Value buffers: both are freed before either pointer is cleared.
    cuda.Free(valsCSRPtr);
    cuda.Free(valsCSCPtr);
    valsCSRPtr.Pointer = IntPtr.Zero;
    valsCSCPtr.Pointer = IntPtr.Zero;

    // Index buffers.
    cuda.Free(idxCSRPtr);
    cuda.Free(idxCSCPtr);
    idxCSRPtr.Pointer = IntPtr.Zero;
    idxCSCPtr.Pointer = IntPtr.Zero;

    // Vector-length buffers.
    cuda.Free(vecLenghtCSRPtr);
    cuda.Free(vecLenghtCSCPtr);
    vecLenghtCSRPtr.Pointer = IntPtr.Zero;
    vecLenghtCSCPtr.Pointer = IntPtr.Zero;

    cuda.Free(qdPtr);
    qdPtr.Pointer = IntPtr.Zero;

    // diagPtr is only reset here; this method does not free it.
    diagPtr.Pointer = IntPtr.Zero;

    cuda.Free(alphaPtr);
    alphaPtr.Pointer = IntPtr.Zero;

    cuda.Free(gradPtr);
    gradPtr.Pointer = IntPtr.Zero;

    // Each of the next three buffers is followed by the texture reference
    // that carries the matching name.
    cuda.Free(deltasPtr);
    deltasPtr.Pointer = IntPtr.Zero;
    cuda.DestroyTexture(cuDeltasTexRef);

    cuda.Free(labelsPtr);
    labelsPtr.Pointer = IntPtr.Zero;
    cuda.DestroyTexture(cuLabelsTexRef);

    cuda.Free(mainVecPtr);
    mainVecPtr.Pointer = IntPtr.Zero;
    cuda.DestroyTexture(cuMainVecTexRef);

    // Module and CUDA object go last; clearing the field turns any later
    // call into a no-op.
    cuda.UnloadModule(cuModule);
    cuda.Dispose();
    cuda = null;
}
/// <summary>
/// Frees the per-stream host and device buffers, destroys the per-stream
/// textures and streams, and then frees the labels and alphas device buffers
/// when their pointers are set.
/// </summary>
protected void DisposeResourses()
{
    for (int s = 0; s < NUM_STREAMS; s++)
    {
        // Host and device buffers that belong to stream s.
        cuda.FreeHost(mainVecIntPtrs[s]);
        cuda.Free(mainVecCuPtr[s]);
        cuda.Free(evalOutputCuPtr[s]);
        cuda.Free(reduceCuPtr[s]);
        cuda.FreeHost(reduceIntPtrs[s]);

        // The texture is destroyed only when its reference is non-zero.
        bool textureIsSet = cuVecTexRef[s].Pointer != IntPtr.Zero;
        if (textureIsSet)
        {
            cuda.DestroyTexture(cuVecTexRef[s]);
        }

        cuda.DestroyStream(stream[s]);
    }

    // Shared buffers are freed only when set, and are reset afterwards.
    bool labelsAreSet = labelsPtr.Pointer != IntPtr.Zero;
    if (labelsAreSet)
    {
        cuda.Free(labelsPtr);
        labelsPtr.Pointer = IntPtr.Zero;
    }

    bool alphasAreSet = alphasPtr.Pointer != IntPtr.Zero;
    if (alphasAreSet)
    {
        cuda.Free(alphasPtr);
        alphasPtr.Pointer = IntPtr.Zero;
    }
}
/// <summary>
/// Frees the device buffers for values, indices, vector lengths and labels,
/// the host output buffer, and - when they are set - the two vector buffers
/// and their texture references.
/// </summary>
protected void DisposeResourses()
{
    // Buffers released unconditionally; device pointers are reset after the free.
    cuda.Free(valsPtr);
    valsPtr.Pointer = IntPtr.Zero;

    cuda.Free(idxPtr);
    idxPtr.Pointer = IntPtr.Zero;

    cuda.Free(vecLengthPtr);
    vecLengthPtr.Pointer = IntPtr.Zero;

    // outputPtr is not freed by this method; only the host pointer is released.
    cuda.FreeHost(outputIntPtr);

    cuda.Free(labelsPtr);
    labelsPtr.Pointer = IntPtr.Zero;

    // The two vector buffers are released only when they are set.
    bool vecIIsSet = VecIPtr.Pointer != IntPtr.Zero;
    if (vecIIsSet)
    {
        cuda.Free(VecIPtr);
        VecIPtr.Pointer = IntPtr.Zero;
    }

    bool vecJIsSet = VecJPtr.Pointer != IntPtr.Zero;
    if (vecJIsSet)
    {
        cuda.Free(VecJPtr);
        VecJPtr.Pointer = IntPtr.Zero;
    }

    // Same rule for the texture references.
    bool texIIsSet = cuVecI_TexRef.Pointer != IntPtr.Zero;
    if (texIIsSet)
    {
        cuda.DestroyTexture(cuVecI_TexRef);
    }

    bool texJIsSet = cuVecJ_TexRef.Pointer != IntPtr.Zero;
    if (texJIsSet)
    {
        cuda.DestroyTexture(cuVecJ_TexRef);
    }
}
/// <summary>
/// Releases everything this object holds through <c>cuda</c>: the device
/// buffers, the texture references, the loaded module and the CUDA object
/// itself. Returns immediately when <c>cuda</c> is null.
/// </summary>
private void DisposeCuda()
{
    // Nothing to release when there is no CUDA object.
    if (cuda == null)
    {
        return;
    }

    // Value buffers: both are freed before either pointer is cleared.
    cuda.Free(valsCSRPtr);
    cuda.Free(valsCSCPtr);
    valsCSRPtr.Pointer = IntPtr.Zero;
    valsCSCPtr.Pointer = IntPtr.Zero;

    // Index buffers.
    cuda.Free(idxCSRPtr);
    cuda.Free(idxCSCPtr);
    idxCSRPtr.Pointer = IntPtr.Zero;
    idxCSCPtr.Pointer = IntPtr.Zero;

    // Vector-length buffers.
    cuda.Free(vecLenghtCSRPtr);
    cuda.Free(vecLenghtCSCPtr);
    vecLenghtCSRPtr.Pointer = IntPtr.Zero;
    vecLenghtCSCPtr.Pointer = IntPtr.Zero;

    // Gradient buffers.
    cuda.Free(gradPtr);
    gradPtr.Pointer = IntPtr.Zero;

    cuda.Free(gradOldPtr);
    gradOldPtr.Pointer = IntPtr.Zero;

    // Alpha buffers.
    cuda.Free(alphaPtr);
    alphaPtr.Pointer = IntPtr.Zero;

    cuda.Free(alphaTmpPtr);
    alphaTmpPtr.Pointer = IntPtr.Zero;

    cuda.Free(alphaOldPtr);
    alphaOldPtr.Pointer = IntPtr.Zero;

    // w vector buffers.
    cuda.Free(wVecPtr);
    wVecPtr.Pointer = IntPtr.Zero;

    cuda.Free(wTempVecPtr);
    wTempVecPtr.Pointer = IntPtr.Zero;

    // Reduction buffers.
    cuda.Free(reduceBBAlphaPtr);
    reduceBBAlphaPtr.Pointer = IntPtr.Zero;

    cuda.Free(reduceBBGradPtr);
    reduceBBGradPtr.Pointer = IntPtr.Zero;

    cuda.Free(reduceBBAlphaGradPtr);
    reduceBBAlphaGradPtr.Pointer = IntPtr.Zero;

    cuda.Free(reduceObjAlphaPtr);
    reduceObjAlphaPtr.Pointer = IntPtr.Zero;

    cuda.Free(reduceObjWPtr);
    reduceObjWPtr.Pointer = IntPtr.Zero;

    cuda.Free(reduceGradMaxNormPtr);
    reduceGradMaxNormPtr.Pointer = IntPtr.Zero;

    // diagPtr and stepBBPtr are not released by this method.

    cuda.Free(deltasPtr);
    deltasPtr.Pointer = IntPtr.Zero;
    cuda.DestroyTexture(cuDeltasTexRef);

    cuda.Free(labelsPtr);
    labelsPtr.Pointer = IntPtr.Zero;
    cuda.DestroyTexture(cuLabelsTexRef);

    cuda.DestroyTexture(cuWVecTexRef);

    // Module and CUDA object go last; clearing the field turns any later
    // call into a no-op.
    cuda.UnloadModule(cuModule);
    cuda.Dispose();
    cuda = null;
}
/// <summary>
/// Benchmarks a sparse matrix product on the GPU. Two random CRS matrices are
/// generated; for every index k in [0, Cols) a dense vector for k is staged in
/// one of two page-locked host buffers, copied asynchronously into the device
/// buffer bound to the "vectorTexRef" texture, and the kernel named by
/// <paramref name="moduleFunction"/> is launched on the same stream.
/// </summary>
/// <param name="repetition">How many times the full sweep over all k is repeated.</param>
/// <param name="moduleFunction">Name of the kernel looked up after loading "matrixKernels.cubin".</param>
/// <param name="blockSizeX">Block size in X; also used to derive the grid size from <c>Rows</c>.</param>
/// <param name="blockSizeY">Block size in Y.</param>
/// <returns>Host copy of the <c>Rows * Cols</c> floats in the mapped output buffer.</returns>
public static float[] CRSSparseMMwithDenseVector(int repetition, string moduleFunction, int blockSizeX, int blockSizeY)
{
    CUDA cuda = new CUDA(0, true);

    // load module
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));
    CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("------------------------------------");
    Console.WriteLine("init Matrix");
    Stopwatch t = Stopwatch.StartNew();

    // values in CRS format
    float[] AVals, BVals;
    // indices in CRS format
    int[] AIdx, BIdx;
    // length of each row in CRS format
    int[] ARowLen, BRowLen;

    int maxIndex = 0;
    MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);
    MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a stopwatch that is already running does nothing, so the
    // timer must be reset for the copy phase to be measured on its own.
    t.Reset();
    t.Start();

    CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
    CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
    CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

    int outputSize = Rows * Cols;
    float[] output = new float[outputSize];

    // Output lives in mapped host memory; dOutput is the device-side alias of
    // outputPtr2, not a separate device allocation.
    IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

    // Dense vector storage: two page-locked staging buffers used alternately.
    int denseLength = maxIndex + 1;
    uint memSize = (uint)(denseLength * sizeof(float));

    CUstream stream0 = cuda.CreateStream();
    IntPtr[] mainVecIntPtrs = new IntPtr[2];
    mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
    mainVecIntPtrs[1] = cuda.AllocateHost(memSize);

    // Page-locked memory is not guaranteed to be zeroed. The loop below only
    // clears entries through Helpers.SetBufferIdx (presumably the ones written
    // for a previous k), so both buffers are zero-filled before first use.
    float[] zeros = new float[denseLength];
    Marshal.Copy(zeros, 0, mainVecIntPtrs[0], denseLength);
    Marshal.Copy(zeros, 0, mainVecIntPtrs[1], denseLength);

    CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

    // get texture reference and bind it to the dense vector on the device
    CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");
    cuda.SetTextureFlags(cuTexRef, 0);
    cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    #region set cuda parameters
    cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

    int offset = 0;
    cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, (uint)Rows);
    offset += sizeof(int);
    cuda.SetParameter(cuFunc, offset, (uint)Cols);
    offset += sizeof(int);

    // Remember where the per-iteration index parameter lives so it can be
    // overwritten before every launch.
    int colIndexParamOffset = offset;
    cuda.SetParameter(cuFunc, offset, (uint)0);
    offset += sizeof(int);
    cuda.SetParameterSize(cuFunc, (uint)offset);
    #endregion

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);

    for (int rep = 0; rep < repetition; rep++)
    {
        for (int k = 0; k < Cols; k++)
        {
            // Fill the staging buffer for this k while earlier work may still run.
            Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

            // Wait for the previous copy and kernel before reusing the device buffer.
            cuda.SynchronizeStream(stream0);

            cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
            cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
            cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);

            // Clear the other staging buffer while the GPU is busy.
            // NOTE(review): for k == 0 this passes index -1 to the helper -
            // confirm that Helpers.SetBufferIdx tolerates a negative index.
            Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);
        }
    }

    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    timer.Stop();

    float cudaTime = cuda.ElapsedTime(start, end);

    // The kernel wrote into mapped host memory; copy it into the managed array.
    Marshal.Copy(outputPtr2, output, 0, outputSize);

    Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
    Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

    // Never print more entries than the output actually holds.
    int lenght = Math.Min(displayCount, outputSize);
    Console.WriteLine();
    for (int i = 0; i < lenght; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(AValsPtr);
    cuda.Free(AIdxPtr);
    cuda.Free(ALenghtPtr);

    // dOutput aliases the mapped host allocation, so the allocation is
    // released through its host pointer rather than with Free(dOutput).
    cuda.FreeHost(outputPtr2);

    // The page-locked staging buffers were previously never released.
    cuda.FreeHost(mainVecIntPtrs[0]);
    cuda.FreeHost(mainVecIntPtrs[1]);

    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);
    cuda.DestroyStream(stream0);
    cuda.Free(mainVecPtr);
    cuda.DestroyTexture(cuTexRef);

    // Release the module and the CUDA object created at the top of this method.
    cuda.UnloadModule(module);
    cuda.Dispose();

    return output;
}
/// <summary>
/// Benchmarks a sparse matrix product on the GPU. Two random CRS matrices are
/// generated; for every index k in [0, Cols) a dense vector for k is staged in
/// one of two page-locked host buffers, copied asynchronously into the device
/// buffer bound to the "vectorTexRef" texture, and the kernel named by
/// <paramref name="moduleFunction"/> is launched on the same stream.
/// </summary>
/// <param name="repetition">How many times the full sweep over all k is repeated.</param>
/// <param name="moduleFunction">Name of the kernel looked up after loading "matrixKernels.cubin".</param>
/// <param name="blockSizeX">Block size in X; also used to derive the grid size from <c>Rows</c>.</param>
/// <param name="blockSizeY">Block size in Y.</param>
/// <returns>Host copy of the <c>Rows * Cols</c> floats in the mapped output buffer.</returns>
public static float[] CRSSparseMMwithDenseVector(int repetition, string moduleFunction, int blockSizeX, int blockSizeY)
{
    CUDA cuda = new CUDA(0, true);

    // load module
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin"));
    CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("------------------------------------");
    Console.WriteLine("init Matrix");
    Stopwatch t = Stopwatch.StartNew();

    // values in CRS format
    float[] AVals, BVals;
    // indices in CRS format
    int[] AIdx, BIdx;
    // length of each row in CRS format
    int[] ARowLen, BRowLen;

    int maxIndex = 0;
    MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex);
    MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex);

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a stopwatch that is already running does nothing, so the
    // timer must be reset for the copy phase to be measured on its own.
    t.Reset();
    t.Start();

    CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals);
    CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx);
    CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen);

    int outputSize = Rows * Cols;
    float[] output = new float[outputSize];

    // Output lives in mapped host memory; dOutput is the device-side alias of
    // outputPtr2, not a separate device allocation.
    IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

    // Dense vector storage: two page-locked staging buffers used alternately.
    int denseLength = maxIndex + 1;
    uint memSize = (uint)(denseLength * sizeof(float));

    CUstream stream0 = cuda.CreateStream();
    IntPtr[] mainVecIntPtrs = new IntPtr[2];
    mainVecIntPtrs[0] = cuda.AllocateHost(memSize);
    mainVecIntPtrs[1] = cuda.AllocateHost(memSize);

    // Page-locked memory is not guaranteed to be zeroed. The loop below only
    // clears entries through Helpers.SetBufferIdx (presumably the ones written
    // for a previous k), so both buffers are zero-filled before first use.
    float[] zeros = new float[denseLength];
    Marshal.Copy(zeros, 0, mainVecIntPtrs[0], denseLength);
    Marshal.Copy(zeros, 0, mainVecIntPtrs[1], denseLength);

    CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0);

    // get texture reference and bind it to the dense vector on the device
    CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef");
    cuda.SetTextureFlags(cuTexRef, 0);
    cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    #region set cuda parameters
    cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1);

    int offset = 0;
    cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, (uint)Rows);
    offset += sizeof(int);
    cuda.SetParameter(cuFunc, offset, (uint)Cols);
    offset += sizeof(int);

    // Remember where the per-iteration index parameter lives so it can be
    // overwritten before every launch.
    int colIndexParamOffset = offset;
    cuda.SetParameter(cuFunc, offset, (uint)0);
    offset += sizeof(int);
    cuda.SetParameterSize(cuFunc, (uint)offset);
    #endregion

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX));

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);

    for (int rep = 0; rep < repetition; rep++)
    {
        for (int k = 0; k < Cols; k++)
        {
            // Fill the staging buffer for this k while earlier work may still run.
            Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]);

            // Wait for the previous copy and kernel before reusing the device buffer.
            cuda.SynchronizeStream(stream0);

            cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0);
            cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k);
            cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0);

            // Clear the other staging buffer while the GPU is busy.
            // NOTE(review): for k == 0 this passes index -1 to the helper -
            // confirm that Helpers.SetBufferIdx tolerates a negative index.
            Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f);
        }
    }

    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    timer.Stop();

    float cudaTime = cuda.ElapsedTime(start, end);

    // The kernel wrote into mapped host memory; copy it into the managed array.
    Marshal.Copy(outputPtr2, output, 0, outputSize);

    Console.WriteLine("Matrix products with kernel {0}", moduleFunction);
    Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed);

    // Never print more entries than the output actually holds.
    int lenght = Math.Min(displayCount, outputSize);
    Console.WriteLine();
    for (int i = 0; i < lenght; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(AValsPtr);
    cuda.Free(AIdxPtr);
    cuda.Free(ALenghtPtr);

    // dOutput aliases the mapped host allocation, so the allocation is
    // released through its host pointer rather than with Free(dOutput).
    cuda.FreeHost(outputPtr2);

    // The page-locked staging buffers were previously never released.
    cuda.FreeHost(mainVecIntPtrs[0]);
    cuda.FreeHost(mainVecIntPtrs[1]);

    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);
    cuda.DestroyStream(stream0);
    cuda.Free(mainVecPtr);
    cuda.DestroyTexture(cuTexRef);

    // Release the module and the CUDA object created at the top of this method.
    cuda.UnloadModule(module);
    cuda.Dispose();

    return output;
}
/// <summary>
/// Runs the "RBFspmv_csr_vector" kernel from "structKernel.cubin" once over N
/// generated sparse vectors stored in CSR form. The vector at
/// <c>mainIndex</c> is expanded into a dense array and bound to the "texRef"
/// texture through a CUDA array; the squared norm of every vector is passed
/// to the kernel as well.
/// </summary>
/// <returns>The N floats copied back from the device output buffer.</returns>
private static float[] CuRBFCSRCached()
{
    CUDA cuda = new CUDA(0, true);

    // load module
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
    CUfunction structPassFunc = cuda.GetModuleFunction("RBFspmv_csr_vector");

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("init arrays");
    Stopwatch t = Stopwatch.StartNew();

    List<float> vecValsL = new List<float>(N * maxRowSize / 2);
    List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
    // N start offsets plus one closing offset are stored below.
    List<int> vecLenghtL = new List<int>(N + 1);

    float[] vecVals;
    int[] vecIdx;
    int[] vecLenght;
    float[] selfDot = new float[N];

    maxIndex = 0;
    int vecStartIdx = 0;
    for (int i = 0; i < N; i++)
    {
        int vecSize = avgElements + i % stdElements;

        float[] vals = Helpers.InitValues(i, vecSize, maxVal);
        vecValsL.AddRange(vals);

        // squared norm of vector i
        for (int z = 0; z < vals.Length; z++)
        {
            selfDot[i] += vals[z] * vals[z];
        }

        int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
        vecIdxL.AddRange(index);

        // offset at which vector i starts in the flattened arrays
        vecLenghtL.Add(vecStartIdx);
        vecStartIdx += vecSize;
    }
    // closing offset, so vector i spans [vecLenght[i], vecLenght[i + 1])
    vecLenghtL.Add(vecStartIdx);

    vecVals = vecValsL.ToArray();
    vecIdx = vecIdxL.ToArray();
    vecLenght = vecLenghtL.ToArray();

    // dense copy of the vector at mainIndex
    float[] mainVec = new float[maxIndex + 1];
    for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++)
    {
        int idx = vecIdx[j];
        float val = vecVals[j];
        mainVec[idx] = val;
    }

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a stopwatch that is already running does nothing, so the
    // timer must be reset for the copy phase to be measured on its own.
    t.Reset();
    t.Start();

    CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
    CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
    CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);
    CUdeviceptr selfDotPtr = cuda.CopyHostToDevice(selfDot);

    // copy the dense vector into a CUDA array and bind it to the texture
    CUarray cuArr = cuda.CreateArray(mainVec);
    cuda.CopyHostToArray(cuArr, mainVec, 0);

    CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
    cuda.SetTextureFlags(cuTexRef, 0);
    cuda.SetTextureArray(cuTexRef, cuArr);

    float[] output = new float[N];
    CUdeviceptr dOutput = cuda.Allocate(output);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

    int offset = 0;
    cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, vecLenghtPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, selfDotPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, (uint)N);
    offset += sizeof(int);
    cuda.SetParameter(structPassFunc, offset, (uint)mainIndex);
    offset += sizeof(int);
    cuda.SetParameter(structPassFunc, offset, Gamma);
    offset += sizeof(float);
    cuda.SetParameter(structPassFunc, offset, (uint)vecStartIdx);
    offset += sizeof(int);
    cuda.SetParameterSize(structPassFunc, (uint)offset);

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);
    cuda.Launch(structPassFunc, blocksPerGrid, 1);
    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    timer.Stop();

    float naiveTime = cuda.ElapsedTime(start, end);
    Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

    cuda.CopyDeviceToHost(dOutput, output);

    int lenght = Math.Min(displayCount, N);
    Console.WriteLine();
    for (int i = 0; i < lenght; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(valsPtr);
    cuda.Free(idxPtr);
    cuda.Free(dOutput);
    cuda.Free(selfDotPtr);
    cuda.Free(vecLenghtPtr);
    cuda.DestroyArray(cuArr);
    cuda.DestroyTexture(cuTexRef);
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);

    // Release the module and the CUDA object created at the top of this method.
    cuda.UnloadModule(module);
    cuda.Dispose();

    return output;
}
/// <summary>
/// Runs the "DotProdEllPackCached" kernel from "structKernel.cubin" once over
/// N generated sparse vectors. Values and indices are stored in two
/// <c>N * maxRowSize</c> arrays where element z of vector i sits at
/// <c>z * N + i</c>; the vector at <c>mainIndex</c> is expanded into a dense
/// array and bound to the "texRef" texture through a CUDA array.
/// </summary>
/// <returns>The N floats copied back from the device output buffer.</returns>
private static float[] CuDotProdEllPackTexCached()
{
    CUDA cuda = new CUDA(0, true);

    // load module
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
    CUfunction structPassFunc = cuda.GetModuleFunction("DotProdEllPackCached");

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("init arrays");
    Stopwatch t = Stopwatch.StartNew();

    float[] vecVals = new float[N * maxRowSize];
    int[] vecIdx = new int[N * maxRowSize];

    maxIndex = 0;

    // Number of real (non-padding) entries stored for the vector at mainIndex.
    int mainVecLength = 0;

    for (int i = 0; i < N; i++)
    {
        int vecSize = avgElements + i % stdElements;

        float[] vals = Helpers.InitValues(i, vecSize, maxVal);
        // values are laid out column-major: element z of vector i at z * N + i
        for (int z = 0; z < vals.Length; z++)
        {
            int m = z * N + i;
            vecVals[m] = vals[z];
        }

        int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
        for (int z = 0; z < index.Length; z++)
        {
            int m = z * N + i;
            vecIdx[m] = index[z];
        }

        if (i == mainIndex)
        {
            mainVecLength = Math.Min(vals.Length, index.Length);
        }
    }

    // Build the dense main vector from the real entries only. Unused slots
    // keep index 0 and value 0, so walking all maxRowSize slots would write
    // 0 into mainVec[0] and could wipe out a genuine entry at index 0.
    float[] mainVec = new float[maxIndex + 1];
    for (int j = 0; j < mainVecLength; j++)
    {
        int idx = vecIdx[mainIndex + N * j];
        float val = vecVals[mainIndex + N * j];
        mainVec[idx] = val;
    }

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a stopwatch that is already running does nothing, so the
    // timer must be reset for the copy phase to be measured on its own.
    t.Reset();
    t.Start();

    CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
    CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);

    // copy the dense vector into a CUDA array and bind it to the texture
    CUarray cuArr = cuda.CreateArray(mainVec);
    cuda.CopyHostToArray(cuArr, mainVec, 0);

    CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
    cuda.SetTextureFlags(cuTexRef, 0);
    cuda.SetTextureArray(cuTexRef, cuArr);

    float[] output = new float[N];
    CUdeviceptr dOutput = cuda.Allocate(output);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1);

    int offset = 0;
    cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(structPassFunc, offset, (uint)maxRowSize);
    offset += sizeof(int);
    cuda.SetParameter(structPassFunc, offset, (uint)N);
    offset += sizeof(int);
    cuda.SetParameterSize(structPassFunc, (uint)offset);

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);
    cuda.Launch(structPassFunc, blocksPerGrid, 1);
    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    timer.Stop();

    float naiveTime = cuda.ElapsedTime(start, end);
    Console.Write("EllPack Cached Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed);

    cuda.CopyDeviceToHost(dOutput, output);

    int lenght = Math.Min(displayCount, N);
    Console.WriteLine();
    for (int i = 0; i < lenght; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(valsPtr);
    cuda.Free(idxPtr);
    cuda.Free(dOutput);
    cuda.DestroyArray(cuArr);
    cuda.DestroyTexture(cuTexRef);

    // The timing events were previously created but never destroyed.
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);

    // Release the module and the CUDA object created at the top of this method.
    cuda.UnloadModule(module);
    cuda.Dispose();

    return output;
}
/// <summary>
/// Runs the kernel named by <paramref name="moduleFunction"/> from
/// "structKernel.cubin" over N generated sparse vectors stored in CSR form.
/// On every repetition the dense main vector is rebuilt on the host for the
/// current <c>mainIndex</c>, copied to the device buffer bound to the
/// "texRef" texture, the kernel is launched, and the result is read back from
/// mapped host memory; <c>mainIndex</c> is then incremented.
/// </summary>
/// <param name="repetition">Number of kernel launches, each with the next main index.</param>
/// <param name="moduleFunction">Name of the kernel to look up in the loaded module.</param>
/// <returns>The N floats produced by the last launch.</returns>
private static float[] CuDotProdCRSCached(int repetition, string moduleFunction)
{
    CUDA cuda = new CUDA(0, true);

    // load module
    CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin"));
    CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction);

    int maxRowSize = avgElements + stdElements - 1;

    Console.WriteLine("init arrays");
    Stopwatch t = Stopwatch.StartNew();

    // temporary lists for values, indices and vector start offsets
    List<float> vecValsL = new List<float>(N * maxRowSize / 2);
    List<int> vecIdxL = new List<int>(N * maxRowSize / 2);
    List<int> vecLenghtL = new List<int>(N + 1);

    float[] vecVals;
    int[] vecIdx;
    int[] vecLenght;

    maxIndex = 0;
    int vecStartIdx = 0;
    for (int i = 0; i < N; i++)
    {
        int vecSize = avgElements + i % stdElements;

        float[] vals = Helpers.InitValues(i, vecSize, maxVal);
        vecValsL.AddRange(vals);

        int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex);
        vecIdxL.AddRange(index);

        // offset at which vector i starts in the flattened arrays
        vecLenghtL.Add(vecStartIdx);
        vecStartIdx += vecSize;
    }
    // closing offset for the last vector
    vecLenghtL.Add(vecStartIdx);

    vecVals = vecValsL.ToArray();
    vecIdx = vecIdxL.ToArray();
    vecLenght = vecLenghtL.ToArray();

    Console.WriteLine("Init takes {0}", t.Elapsed);

    // Start() on a stopwatch that is already running does nothing, so the
    // timer must be reset for the copy phase to be measured on its own.
    t.Reset();
    t.Start();

    CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals);
    CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx);
    CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght);

    float[] output = new float[N];

    // Output lives in mapped host memory; dOutput is the device-side alias of
    // outputPtr2, not a separate device allocation.
    IntPtr outputPtr2 = cuda.HostAllocate((uint)(N * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP);
    CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0);

    Console.WriteLine("copy to device takes {0}", t.Elapsed);

    #region set cuda parameters
    cuda.SetFunctionBlockShape(cuFunc, threadsPerBlock, 1, 1);

    int offset = 0;
    cuda.SetParameter(cuFunc, offset, valsPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, idxPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, dOutput.Pointer);
    offset += IntPtr.Size;
    cuda.SetParameter(cuFunc, offset, (uint)N);
    offset += sizeof(int);
    cuda.SetParameter(cuFunc, offset, (uint)vecStartIdx);
    offset += sizeof(int);
    cuda.SetParameterSize(cuFunc, (uint)offset);
    #endregion

    Console.WriteLine("start computation");

    CUevent start = cuda.CreateEvent();
    CUevent end = cuda.CreateEvent();

    // Bind the texture to a device buffer that is refreshed on every repetition.
    CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef");
    cuda.SetTextureFlags(cuTexRef, 0);

    float[] mainVec = new float[maxIndex + 1];
    CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVec);
    uint memSize = (uint)((maxIndex + 1) * sizeof(float));
    cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize);

    mainIndex = StartingIndex;

    Stopwatch timer = Stopwatch.StartNew();
    cuda.RecordEvent(start);

    for (int k = 0; k < repetition; k++)
    {
        // rebuild the dense main vector on the host for the current index
        Helpers.InitMainVector(vecVals, vecIdx, vecLenght, mainIndex, ref mainVec);

        cuda.CopyHostToDevice(mainVecPtr, mainVec);
        cuda.Launch(cuFunc, blocksPerGrid, 1);
        cuda.SynchronizeContext();

        // the kernel wrote into mapped host memory; copy into the managed array
        Marshal.Copy(outputPtr2, output, 0, N);

        mainIndex++;
    }

    cuda.RecordEvent(end);
    cuda.SynchronizeContext();
    timer.Stop();

    float naiveTime = cuda.ElapsedTime(start, end);
    Console.Write("Dot products with kernel {0}, mainIndex {1} and {2}-vectors takes {3} ms stopwatch time {4} ms", moduleFunction, mainIndex, N, naiveTime, timer.Elapsed);

    int lenght = Math.Min(displayCount, N);
    Console.WriteLine();
    for (int i = 0; i < lenght; i++)
    {
        Console.WriteLine("{0}-{1}", i, output[i]);
    }

    cuda.Free(valsPtr);
    cuda.Free(idxPtr);

    // dOutput aliases the mapped host allocation, so the allocation is
    // released through its host pointer rather than with Free(dOutput).
    cuda.FreeHost(outputPtr2);

    cuda.Free(vecLenghtPtr);
    cuda.Free(mainVecPtr);
    cuda.DestroyTexture(cuTexRef);
    cuda.DestroyEvent(start);
    cuda.DestroyEvent(end);

    // Release the module and the CUDA object created at the top of this method.
    cuda.UnloadModule(module);
    cuda.Dispose();

    return output;
}