// It's interesting to change the number of blocks and the number of threads to // understand how to keep the hardware busy. // // Here are some numbers I get on my G80: // blocks - clocks // 1 - 3096 // 8 - 3232 // 16 - 3364 // 32 - 4615 // 64 - 9981 // // With less than 16 blocks some of the multiprocessors of the device are idle. With // more than 16 you are using all the multiprocessors, but there's only one block per // multiprocessor and that doesn't allow you to hide the latency of the memory. With // more than 32 the speed scales linearly. static void Main(string[] args) { // Init CUDA, select 1st device. CUDA cuda = new CUDA(0, true); // load module cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "clock_kernel.cubin")); CUfunction func = cuda.GetModuleFunction("timedReduction"); int[] timer = new int[NUM_BLOCKS * 2]; float[] input = new float[NUM_THREADS * 2]; for (int i = 0; i < NUM_THREADS * 2; i++) { input[i] = (float)i; } CUdeviceptr dinput = cuda.CopyHostToDevice<float>(input); CUdeviceptr doutput = cuda.Allocate((uint)(sizeof(float) * NUM_BLOCKS)); CUdeviceptr dtimer = cuda.Allocate<int>(timer); cuda.SetParameter(func, 0, (uint)dinput.Pointer); cuda.SetParameter(func, IntPtr.Size, (uint)doutput.Pointer); cuda.SetParameter(func, IntPtr.Size*2, (uint)dtimer.Pointer); cuda.SetParameterSize(func, (uint)(IntPtr.Size*3)); //timedReduction<<<NUM_BLOCKS, NUM_THREADS, sizeof(float) * 2 * NUM_THREADS>>>(dinput, doutput, dtimer); cuda.SetFunctionBlockShape(func, NUM_THREADS, 1, 1); cuda.SetFunctionSharedSize(func, (uint)(sizeof(float) * 2 * NUM_THREADS)); cuda.Launch(func, NUM_BLOCKS, 1); cuda.CopyDeviceToHost<int>(dtimer, timer); cuda.Free(dinput); cuda.Free(doutput); cuda.Free(dtimer); foreach (int i in timer) { Console.WriteLine(i); } Console.WriteLine("Test PASSED"); int minStart = timer[0]; int maxEnd = timer[NUM_BLOCKS]; for (int i = 1; i < NUM_BLOCKS; i++) { minStart = timer[i] < minStart ? timer[i] : minStart; maxEnd = timer[NUM_BLOCKS + i] > maxEnd ? timer[NUM_BLOCKS + i] : maxEnd; } Console.WriteLine("time = {0}", maxEnd - minStart); }
// // A sorting network is a sorting algorith, where the sequence of comparisons // is not data-dependent. That makes them suitable for parallel implementations. // // Bitonic sort is one of the fastest sorting networks, consisting of o(n log^2 n) // comparators. It has a simple implemention and it's very efficient when sorting // a small number of elements: // // http://citeseer.ist.psu.edu/blelloch98experimental.html // // This implementation is based on: // // http://www.tools-of-computing.com/tc/CS/Sorts/bitonic_sort.htm // static void Main(string[] args) { const int NUM = 256; // Init CUDA, select 1st device. CUDA cuda = new CUDA(0, true); // create values int[] values = new int[NUM]; Random rand = new Random(); for (int i = 0; i < NUM; i++) { values[i] = rand.Next(); } // allocate memory and copy to device CUdeviceptr dvalues = cuda.CopyHostToDevice<int>(values); // load module cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "bitonic.cubin")); CUfunction func = cuda.GetModuleFunction("bitonicSort"); cuda.SetParameter(func, 0, (uint)dvalues.Pointer); cuda.SetParameterSize(func, (uint)IntPtr.Size); //bitonicSort<<<1, NUM, sizeof(int) * NUM>>>(dvalues); cuda.SetFunctionBlockShape(func, NUM, 1, 1); cuda.SetFunctionSharedSize(func, sizeof(int) * NUM); cuda.Launch(func, 1, 1); cuda.CopyDeviceToHost<int>(dvalues, values); cuda.Free(dvalues); bool passed = true; for (int i = 1; i < NUM; i++) { if (values[i - 1] > values[i]) { passed = false; break; } } Console.WriteLine("Test {0}", passed ? "PASSED" : "FAILED"); }
private void InitCudaModule() { cuda = new CUDA(0, true); cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName)); cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName); cuFuncGradFinalize = cuda.GetModuleFunction(cudaGradFinalizeName); cuFuncComputeBBstep = cuda.GetModuleFunction(cudaComputeBBStepName); cuFuncObjSquareW = cuda.GetModuleFunction(cudaObjWName); cuFuncObjSquareAlpha = cuda.GetModuleFunction(cudaObjAlphaName); cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW); cuFuncUpdateAlpha = cuda.GetModuleFunction(cudaUpdateAlphaName); cuFuncLinPart = cuda.GetModuleFunction(cudaLinPartName); }
protected void InitCudaModule() { int deviceNr = 0; cuda = new CUDA(deviceNr, true); cuCtx = cuda.CreateContext(deviceNr, CUCtxFlags.MapHost); //cuda.SetCurrentContext(cuCtx); //var ctx = cuda.PopCurrentContext(); //var ctx2 = cuda.PopCurrentContext(); //var ctx3 = cuda.PopCurrentContext(); string modluePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName); if (!File.Exists(modluePath)) { throw new ArgumentException("Failed access to cuda module" + modluePath); } cuModule = cuda.LoadModule(modluePath); cuFunc = cuda.GetModuleFunction(cudaProductKernelName); }
static void Main(string[] args) { // Init and select 1st device. CUDA cuda = new CUDA(0, true); // load module //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.cubin")); cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx")); CUfunction transpose = cuda.GetModuleFunction("transpose"); CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive"); const int size_x = 4096; const int size_y = 4096; const int mem_size = sizeof(float) * size_x * size_y; float[] h_idata = new float[size_x * size_y]; for (int i = 0; i < h_idata.Length; i++) { h_idata[i] = (float)i; } // allocate device memory // copy host memory to device CUdeviceptr d_idata = cuda.CopyHostToDevice<float>(h_idata); CUdeviceptr d_odata = cuda.Allocate<float>(h_idata); // setup execution parameters cuda.SetFunctionBlockShape(transpose_naive, BLOCK_DIM, BLOCK_DIM, 1); cuda.SetParameter(transpose_naive, 0, (uint)d_odata.Pointer); cuda.SetParameter(transpose_naive, IntPtr.Size, (uint)d_idata.Pointer); cuda.SetParameter(transpose_naive, IntPtr.Size * 2, (uint)size_x); cuda.SetParameter(transpose_naive, IntPtr.Size * 2 + 4, (uint)size_y); cuda.SetParameterSize(transpose_naive, (uint)(IntPtr.Size * 2 + 8)); cuda.SetFunctionBlockShape(transpose, BLOCK_DIM, BLOCK_DIM, 1); cuda.SetParameter(transpose, 0, (uint)d_odata.Pointer); cuda.SetParameter(transpose, IntPtr.Size, (uint)d_idata.Pointer); cuda.SetParameter(transpose, IntPtr.Size * 2, (uint)size_x); cuda.SetParameter(transpose, IntPtr.Size * 2 + 4, (uint)size_y); cuda.SetParameterSize(transpose, (uint)(IntPtr.Size * 2 + 8)); // warmup so we don't time CUDA startup cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM); cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM); //System.Threading.Thread.Sleep(10); int numIterations = 100; Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); cuda.RecordEvent(start); for (int i = 0; i < numIterations; i++) { cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM); } cuda.SynchronizeContext(); cuda.RecordEvent(end); cuda.SynchronizeContext(); float naiveTime = cuda.ElapsedTime(start, end); Console.WriteLine("Naive transpose average time: {0} ms\n", naiveTime / numIterations); cuda.RecordEvent(start); for (int i = 0; i < numIterations; i++) { cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM); } cuda.SynchronizeContext(); cuda.RecordEvent(end); cuda.SynchronizeContext(); float optimizedTime = cuda.ElapsedTime(start, end); Console.WriteLine("Optimized transpose average time: {0} ms\n", optimizedTime / numIterations); float[] h_odata = new float[size_x * size_y]; cuda.CopyDeviceToHost<float>(d_odata, h_odata); float[] reference = new float[size_x * size_y]; computeGold(reference, h_idata, size_x, size_y); bool res = CompareF(reference, h_odata, size_x * size_y); Console.WriteLine("Test {0}", res == true? "PASSED":"FAILED"); cuda.Free(d_idata); cuda.Free(d_odata); Console.ReadKey(); }
static private void Worker(object cState) { try { Command cCmd; CUDA cCUDA = new CUDA(true); for (int i = 0; i < 10; i++) { try { cCUDA.CreateContext(i); (new Logger()).WriteDebug2(i + ": success"); break; } catch (Exception ex) { (new Logger()).WriteDebug2(i + ": failed"); if (Logger.bDebug && Logger.Level.debug3 > Logger.eLevelMinimum) (new Logger()).WriteError(ex); } } uint nMemoryReservedForMerge = 2 * 1024 * 1024; //PREFERENCES типа <memory reserved="2097152" /> uint nMemoryStarvationThreshold = cCUDA.TotalMemory / 2; //PREFERENCES через проценты... типа <memory starvation="50%" /> uint nMemoryFree; string sModule = "CUDAFunctions_" + Preferences.nCUDAVersion + "_x" + (IntPtr.Size * 8); if (Logger.bDebug) (new Logger()).WriteDebug3(sModule); cCUDA.LoadModule((byte[])Properties.Resource.ResourceManager.GetObject(sModule)); // $(ProjectDir)Resources\CUDAFunctions.cubin //cCUDA.LoadModule(@"c:\projects\!helpers\video\PixelsMap\Resources\CUDAFunctions.cubin"); CUfunction cCUDAFuncMerge = cCUDA.GetModuleFunction("CUDAFrameMerge"); int nThreadsPerBlock = 256; //пришлось уменьшить с 512 до 256 сридов на блок, потому что при добавлении "движения" и операций с float, ловил ошибку: Too Many Resources Requested for Launch (This error means that the number of registers available on the multiprocessor is being exceeded. Reduce the number of threads per block to solve the problem) cCUDA.SetFunctionBlockShape(cCUDAFuncMerge, nThreadsPerBlock, 1, 1); CUDADriver.cuParamSetSize(cCUDAFuncMerge, 8); Dictionary<ulong, CUdeviceptr> ahDevicePointers = new Dictionary<ulong, CUdeviceptr>(); CUdeviceptr cPMs; CUdeviceptr cInfos; CUdeviceptr cAlphaMap; { //IntPtr[] aPointersByAlpha = new IntPtr[254]; //те самые поинтеры-альфы. Ссылаются на массивы поинтеров B, т.е. BackGrounds //IntPtr[] aPointersByBackground = new IntPtr[256]; // те самые массивы поинтеров B, т.е. BackGrounds byte[] aAlphaMap = new byte[16646144]; int nResult, nIndx = 0; for (byte nAlpha = 1; 255 > nAlpha; nAlpha++) { for (ushort nBackground = 0; 256 > nBackground; nBackground++) { for (ushort nForeground = 0; 256 > nForeground; nForeground++) { if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5))) nResult = 255; aAlphaMap[nIndx++] = (byte)nResult; } //aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer; } //aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer; } cAlphaMap = cCUDA.CopyHostToDevice<byte>(aAlphaMap); } //{ // IntPtr[] aPointersByAlpha = new IntPtr[254]; //те самые поинтеры-альфы. Ссылаются на массивы поинтеров B, т.е. BackGrounds // IntPtr[] aPointersByBackground = new IntPtr[256]; // те самые массивы поинтеров B, т.е. BackGrounds // byte[] aResults = new byte[256]; // int nResult; // for (byte nAlpha = 1; 255 > nAlpha; nAlpha++) // { // for (ushort nBackground = 0; 256 > nBackground; nBackground++) // { // for (ushort nForeground = 0; 256 > nForeground; nForeground++) // { // if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5))) // nResult = 255; // aResults[nForeground] = (byte)nResult; // } // aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer; // } // aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer; // } // cAlphaMap = cCUDA.CopyHostToDevice<IntPtr>(aPointersByAlpha); //} #if DEBUG Dictionary<ulong, DateTime> ahDebug = new Dictionary<ulong,DateTime>(); #endif DateTime dtNextTime = DateTime.MinValue, dtNow; long nStartTick; // logging while (true) { if (1 > _aqCommands.CountGet() && (dtNow = DateTime.Now) > dtNextTime) { dtNextTime = dtNow.AddSeconds(60); #if DEBUG dtNow = dtNow.Subtract(TimeSpan.FromHours(2)); string sMessage = ""; foreach (ulong nID in ahDebug.Keys) if (dtNow > ahDebug[nID]) sMessage += "<br>[" + nID + ":" + ahDebug[nID].ToString("HH:mm:ss") + "]"; #endif (new Logger()).WriteDebug("CUDA free memory:" + cCUDA.FreeMemory #if DEBUG + "; possibly timeworn allocations:" + (1 > sMessage.Length ? "no" : sMessage) #endif ); } while (true) { try { cCmd = _aqCommands.Dequeue(); //если нечего отдать - заснёт break; } catch (Exception ex) { (new Logger()).WriteError(ex); } } _CommandsCount = _aqCommands.nCount; switch (cCmd.eID) { case Command.ID.Allocate: #region try { cCmd.cPM._cException = null; if (1 > cCmd.cPM._nID) { if (0 < cCmd.cPM._nBytesQty) { nMemoryFree = cCUDA.FreeMemory; if (nMemoryReservedForMerge < nMemoryFree - cCmd.cPM._nBytesQty) { bMemoryStarvation = (nMemoryFree < nMemoryStarvationThreshold); cCmd.cPM._nID = _nCurrentID++; ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.Allocate(cCmd.cPM._nBytesQty)); #if DEBUG ahDebug.Add(cCmd.cPM._nID, DateTime.Now); #endif } else { bMemoryStarvation = true; throw new Exception("out of memory in CUDA device during Allocate. Only 2 MBytes reserved for the Merge"); } } else throw new Exception("bytes quantity in PixelsMap have to be greater than zero for Allocate [_bDisposed = " + cCmd.cPM._bDisposed + "][_bProcessing = " + cCmd.cPM._bProcessing + "][_bShiftVertical = " + cCmd.cPM._bShiftVertical + "][_bTemp = " + cCmd.cPM._bTemp + "][_dt = " + cCmd.cPM._dt + "][_nBytesQty = " + cCmd.cPM._nBytesQty + "][_nID = " + cCmd.cPM._nID + "][_nShiftPosition = " + cCmd.cPM._nShiftPosition + "][_stArea.nHeight = " + cCmd.cPM._stArea.nHeight + "][_stArea.nWidth = " + cCmd.cPM._stArea.nWidth + "][bKeepAlive = " + cCmd.cPM.bKeepAlive + "][bBackgroundClear = " + cCmd.cPM.bBackgroundClear + "][eAlpha = " + cCmd.cPM.eAlpha + "][bCUDA = " + cCmd.cPM.bCUDA + "][nAlphaConstant = " + cCmd.cPM.nAlphaConstant + "][nID = " + cCmd.cPM.nID + "][nLength = " + cCmd.cPM.nLength + "][stArea.nHeight = " + cCmd.cPM.stArea.nHeight + "][stArea.nWidth = " + cCmd.cPM.stArea.nWidth + "]"); } else throw new Exception("PixelsMap ID have to be zero for Allocate"); } catch (Exception ex) { if (ex is CUDAException) ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); (new Logger()).WriteError(ex); (new Logger()).WriteDebug("bytes qty:" + cCmd.cPM._nBytesQty); cCmd.cPM._cException = ex; } cCmd.cMRE.Set(); break; #endregion case Command.ID.CopyIn: #region nStartTick = DateTime.Now.Ticks; // logging try { cCmd.cPM._cException = null; if (1 > cCmd.cPM._nID) { if (cCUDA.FreeMemory - cCmd.cPM._nBytesQty > nMemoryReservedForMerge) { cCmd.cPM._nID = _nCurrentID++; if (cCmd.ahParameters.ContainsKey(typeof(IntPtr))) ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty)); else if (cCmd.ahParameters.ContainsKey(typeof(byte[]))) ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((byte[])cCmd.ahParameters[typeof(byte[])])); else throw new Exception("unknown parameter type"); #if DEBUG ahDebug.Add(cCmd.cPM._nID, DateTime.Now); #endif } else throw new Exception("out of memory in CUDA device during CopyIn. Only 2 MBytes reserved for the Merge."); } else { if (cCmd.ahParameters.ContainsKey(typeof(IntPtr))) cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty); else if (cCmd.ahParameters.ContainsKey(typeof(byte[]))) cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (byte[])cCmd.ahParameters[typeof(byte[])]); else throw new Exception("unknown parameter type"); } if (ahDevicePointers.ContainsKey(cCmd.cPM._nID)) (new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]"); else (new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr: not in dictionary]"); } catch (Exception ex) { if (ex is CUDAException) ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); (new Logger()).WriteError(ex); cCmd.cPM._cException = ex; } if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20) // logging (new Logger()).WriteNotice("PixelMap: Command.ID.CopyIn: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms"); // logging cCmd.cMRE.Set(); break; #endregion case Command.ID.CopyOut: #region nStartTick = DateTime.Now.Ticks; // logging try { if (0 < cCmd.cPM._nID) { if(!cCmd.ahParameters.ContainsKey(typeof(IntPtr))) { if(cCmd.ahParameters.ContainsKey(typeof(byte[]))) { cCmd.cPM._aBytes = (byte[])cCmd.ahParameters[typeof(byte[])]; if(cCmd.cPM._nBytesQty != cCmd.cPM._aBytes.Length) (new Logger()).WriteWarning("wrong array size for copyout [got:" + cCmd.cPM._aBytes.Length + "][expected:" + cCmd.cPM._nBytesQty + "]"); } else cCmd.cPM._aBytes = new byte[cCmd.cPM._nBytesQty]; cCUDA.CopyDeviceToHost<byte>(ahDevicePointers[cCmd.cPM._nID], cCmd.cPM._aBytes); } else cCUDA.CopyDeviceToHost(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty); (new Logger()).WriteDebug5("copy out [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]"); } else throw new Exception("PixelsMap have to be allocated for CopyOut"); } catch (Exception ex) { if (ex is CUDAException) ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); (new Logger()).WriteError(ex); cCmd.cPM._cException = ex; } if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20) // logging (new Logger()).WriteNotice("PixelMap: Command.ID.CopyOut: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms"); // logging cCmd.cMRE.Set(); break; #endregion case Command.ID.Merge: #region try { List<PixelsMap> aPMs = (List<PixelsMap>)cCmd.ahParameters[typeof(List<PixelsMap>)]; DisCom.MergeInfo cMergeInfo = (DisCom.MergeInfo)cCmd.ahParameters[typeof(DisCom.MergeInfo)]; List<IntPtr> aDPs = new List<IntPtr>(); if (1 > cCmd.cPM._nID) throw new Exception("background PixelsMap have to be allocated for Merge"); aDPs.Add((IntPtr)ahDevicePointers[cCmd.cPM._nID].Pointer); for (int nIndx = 0; nIndx < aPMs.Count; nIndx++) { if (!ahDevicePointers.ContainsKey(aPMs[nIndx]._nID)) throw new Exception("there is a corrupted ID in layers for merge [id:" + aPMs[nIndx]._nID + "]"); if (1 > ahDevicePointers[aPMs[nIndx]._nID].Pointer) throw new Exception("there is an empty pointer in layers for merge [id:" + aPMs[nIndx]._nID + "]"); aDPs.Add((IntPtr)ahDevicePointers[aPMs[nIndx]._nID].Pointer); } cPMs = cCUDA.CopyHostToDevice<IntPtr>(aDPs.ToArray()); cInfos = cCUDA.CopyHostToDevice(cMergeInfo, cMergeInfo.SizeGet()); cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, 0, (IntPtr)cPMs.Pointer); cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size, (IntPtr)cInfos.Pointer); cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size * 2, (IntPtr)cAlphaMap.Pointer); cCUDA.SetParameterSize(cCUDAFuncMerge, (uint)(IntPtr.Size * 3)); int nIterations = (0 == cMergeInfo.nBackgroundSize % nThreadsPerBlock ? cMergeInfo.nBackgroundSize / nThreadsPerBlock : cMergeInfo.nBackgroundSize / nThreadsPerBlock + 1); cCUDA.Launch(cCUDAFuncMerge, nIterations, 1); cCmd.cMRE.Set(); cMergeInfo.Dispose(); cCUDA.Free(cPMs); cCUDA.Free(cInfos); for (int nIndx = 0; nIndx < aPMs.Count; nIndx++) { lock (aPMs[nIndx]._cSyncRoot) aPMs[nIndx]._bProcessing = false; aPMs[nIndx].Dispose(); } } catch (Exception ex) { cCmd.cMRE.Set(); if (ex is CUDAException) ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); (new Logger()).WriteError(ex); cCmd.cPM._cException = ex; } break; #endregion case Command.ID.Dispose: #region nStartTick = DateTime.Now.Ticks; // logging (new Logger()).Write(Logger.Level.debug2, "dispose: in"); try { if (ahDevicePointers.ContainsKey(cCmd.cPM._nID)) { if (0 < cCmd.cPM._nID && 0 < ahDevicePointers[cCmd.cPM._nID].Pointer) { cCUDA.Free(ahDevicePointers[cCmd.cPM._nID]); //cCUDA.SynchronizeContext(); bMemoryStarvation = (cCUDA.FreeMemory < nMemoryStarvationThreshold); (new Logger()).WriteDebug3("dispose [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]"); } ahDevicePointers.Remove(cCmd.cPM._nID); #if DEBUG ahDebug.Remove(cCmd.cPM._nID); #endif cCmd.cPM._nID = 0; } } catch (Exception ex) { if (ex is CUDAException) ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); (new Logger()).WriteError(ex); cCmd.cPM._cException = ex; } (new Logger()).Write(Logger.Level.debug2, "dispose: out"); if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20) // logging (new Logger()).WriteNotice("PixelMap: Command.ID.Dispose: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms"); // logging break; #endregion } } } catch (Exception ex) { (new Logger()).WriteError(ex); } }
/// <summary> /// implementation of sparese matrix product /// </summary> /// <param name="repetition">how many times kernel should be launch</param> /// <param name="moduleFunction">cuda kenrel name</param> /// <param name="blockSizeX">block size X</param> /// <param name="blockSizeY">block size Y</param> /// <param name="transposeGrid">indicate that grid dimensions should be /// computed alternativly, if false than gridDimY- connected with rows /// else gridDim.Y conected with cols</param> /// <returns></returns> public static float[] CRSSparseMM(int repetition, string moduleFunction, int blockSizeX, int blockSizeY, bool transposeGrid) { //int blockSizeX = 4; //int blockSizeY = 4; CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals); CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx); CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //CUdeviceptr dOutput = cuda.Allocate(output); IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; int Belements = BVals.Length; cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Aelements); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Belements); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef"); //cuda.SetTextureFlags(cuTexRef, 0); int gridDimX = (int)Math.Ceiling((Cols + 0.0) / (blockSizeX)); int gridDimY = (int)Math.Ceiling((0.0 + Rows) / blockSizeY); if (transposeGrid) { gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY); } Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int k = 0; k < repetition; k++) { cuda.Launch(cuFunc, gridDimX, gridDimY); cuda.SynchronizeContext(); // cuda.CopyDeviceToHost(dOutput, output); Marshal.Copy(outputPtr2, output, 0, outputSize); } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Console.WriteLine("Matrix products with kernel {0}", moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(BValsPtr); cuda.Free(BIdxPtr); cuda.Free(BLenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); return(output); }
public static float[] CRSSparseMMwithDenseVector(int repetition, string moduleFunction, int blockSizeX, int blockSizeY) { CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //allocate memory for output IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); //create dense vector for each column in B matrix float[] mainVec = new float[maxIndex + 1]; uint memSize = (uint)((maxIndex + 1) * sizeof(float)); CUstream stream0 = cuda.CreateStream(); IntPtr[] mainVecIntPtrs = new IntPtr[2]; //write combined memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); // //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); mainVecIntPtrs[0] = cuda.AllocateHost(memSize); mainVecIntPtrs[1] = cuda.AllocateHost(memSize); CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0); //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); //mapped memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize); //get texture reference CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef"); cuda.SetTextureFlags(cuTexRef, 0); cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); int colIndexParamOffset = offset; cuda.SetParameter(cuFunc, offset, (uint)0); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); int gridDim = (Rows + blockSizeX - 1) / blockSizeX; Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int rep = 0; rep < repetition; rep++) { for (int k = 0; k < Cols; k++) { Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]); cuda.SynchronizeStream(stream0); cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0); cuda.SetParameter(cuFunc, colIndexParamOffset, (uint)k); cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer Helpers.SetBufferIdx(BIdx, BRowLen, k - 1, mainVecIntPtrs[(k + 1) % 2], 0.0f); //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr); ////make asynchronius copy and kernel lauch //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0); //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k); //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f); } } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Marshal.Copy(outputPtr2, output, 0, outputSize); Console.WriteLine("Matrix products with kernel {0}", moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); cuda.DestroyStream(stream0); cuda.Free(mainVecPtr); cuda.DestroyTexture(cuTexRef); return(output); }
static void Main(string[] args) { // Create a new instance of CUDA class, select 1st device. CUDA cuda = new CUDA(0, true); // Prepare parameters. int n = 16 * 1024 * 1024; uint nbytes = (uint)(n * sizeof(int)); int value = 26; // allocate host memory int[] a = new int[n]; // allocate device memory CUdeviceptr d_a = cuda.Allocate <int>(a); CUDADriver.cuMemsetD8(d_a, 0xff, nbytes); // load module cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx")); CUfunction func = cuda.GetModuleFunction("increment_kernel"); // set kernel launch configuration cuda.SetFunctionBlockShape(func, 512, 1, 1); // create cuda event handles CUevent start = cuda.CreateEvent(); CUevent stop = cuda.CreateEvent(); // asynchronously issue work to the GPU (all to stream 0) CUstream stream = new CUstream(); cuda.RecordEvent(start); cuda.CopyHostToDeviceAsync <int>(d_a, a, stream); // set parameters for kernel function cuda.SetParameter(func, 0, (uint)d_a.Pointer); cuda.SetParameter(func, IntPtr.Size, (uint)value); cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4)); // actually launch kernel cuda.LaunchAsync(func, n / 512, 1, stream); // wait for every thing to finish, then start copy back data cuda.CopyDeviceToHostAsync <int>(d_a, a, stream); cuda.RecordEvent(stop); // print the cpu and gpu times Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop)); // check the output for correctness if (CorrectOutput(a, value)) { Console.WriteLine("Test PASSED"); } else { Console.WriteLine("Test FAILED"); } // release resources cuda.DestroyEvent(start); cuda.DestroyEvent(stop); cuda.Free(d_a); }
//private static void InitMainVector(float[] vecVals, int[] vecIdx, int[] vecLenght, float[] mainVec) //{ // for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++) // { // int idx = vecIdx[j]; // float val = vecVals[j]; // mainVec[idx] = val; // } //} private static float[] CuDotProdCSRwriteCombined(int repetition) { //always the same values Random rnd = new Random(1); CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin")); CUfunction cuFunc = cuda.GetModuleFunction("spmv_csr_vector_kernel_wc"); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("init arrays"); Stopwatch t = Stopwatch.StartNew(); //temp lists for values, indices and vecotr lenght List<float> vecValsL = new List<float>(N * maxRowSize / 2); List<int> vecIdxL = new List<int>(N * maxRowSize / 2); List<int> vecLenghtL = new List<int>(N); float[] vecVals; int[] vecIdx; int[] vecLenght; maxIndex = 0; int vecStartIdx = 0; for (int i = 0; i < N; i++) { int vecSize = avgElements + i % stdElements; float[] vals = Helpers.InitValues(i, vecSize, maxVal); vecValsL.AddRange(vals); int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex); vecIdxL.AddRange(index); vecLenghtL.Add(vecStartIdx); vecStartIdx += vecSize; } //for last index vecLenghtL.Add(vecStartIdx); vecVals = vecValsL.ToArray(); vecIdx = vecIdxL.ToArray(); vecLenght = vecLenghtL.ToArray(); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals); CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx); CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght); float[] output = new float[N]; //CUdeviceptr dOutput = cuda.Allocate(output); IntPtr outputPtr2 = cuda.HostAllocate((uint)(N * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); uint memSize = (uint)((maxIndex + 1) * sizeof(float)); uint flags = CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP | CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED; uint tt = (uint)CUMemHostAllocFlags.WriteCombined; uint s = (uint)CUMemHostAllocFlags.DeviceMap; IntPtr mainVecIntPtr = cuda.HostAllocate(memSize, flags); CUdeviceptr mainVecPtr = cuda.GetHostDevicePointer(mainVecIntPtr, 0); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters cuda.SetFunctionBlockShape(cuFunc, threadsPerBlock, 1, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, valsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, idxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, vecLenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, mainVecPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)N); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)vecStartIdx); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); mainIndex = StartingIndex; Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int k = 0; k < repetition; k++) { //float[] tempFloatarr = new float[memSize]; Helpers.InitBuffer(vecVals, vecIdx, vecLenght,mainIndex, mainVecIntPtr); //Marshal.Copy(mainVecIntPtr, tempFloatarr, 0, tempFloatarr.Length); cuda.Launch(cuFunc, blocksPerGrid, 1); cuda.SynchronizeContext(); //cuda.CopyDeviceToHost(dOutput, output); Marshal.Copy(outputPtr2, output, 0, N); //mainVec = new float[maxIndex + 1]; //Array.Clear(mainVec, 0, mainVec.Length); //clear previous vector values Helpers.SetBufferIdx(vecIdx, vecLenght,mainIndex, mainVecIntPtr,0.0f); mainIndex++; } cuda.RecordEvent(end); cuda.SynchronizeContext(); //cuda.SynchronizeEvent(end); // cuda.CopyDeviceToHost(dOutput, output); timer.Stop(); float naiveTime = cuda.ElapsedTime(start, end); Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed); int lenght = Math.Min(displayCount, N); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(valsPtr); cuda.Free(idxPtr); cuda.Free(dOutput); cuda.Free(vecLenghtPtr); //cuda.DestroyArray(cuArr); cuda.Free(mainVecPtr); //cuda.DestroyTexture(cuTexRef); // cuda.Free(mainVecPtr); cuda.DestroyEvent(start); cuda.DestroyEvent(end); return output; }
static void Main(string[] args) { // Init and select 1st device. CUDA cuda = new CUDA(0, true); // load module //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.cubin")); cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "transpose_kernel.ptx")); CUfunction transpose = cuda.GetModuleFunction("transpose"); CUfunction transpose_naive = cuda.GetModuleFunction("transpose_naive"); const int size_x = 4096; const int size_y = 4096; const int mem_size = sizeof(float) * size_x * size_y; float[] h_idata = new float[size_x * size_y]; for (int i = 0; i < h_idata.Length; i++) { h_idata[i] = (float)i; } // allocate device memory // copy host memory to device CUdeviceptr d_idata = cuda.CopyHostToDevice <float>(h_idata); CUdeviceptr d_odata = cuda.Allocate <float>(h_idata); // setup execution parameters cuda.SetFunctionBlockShape(transpose_naive, BLOCK_DIM, BLOCK_DIM, 1); cuda.SetParameter(transpose_naive, 0, (uint)d_odata.Pointer); cuda.SetParameter(transpose_naive, IntPtr.Size, (uint)d_idata.Pointer); cuda.SetParameter(transpose_naive, IntPtr.Size * 2, (uint)size_x); cuda.SetParameter(transpose_naive, IntPtr.Size * 2 + 4, (uint)size_y); cuda.SetParameterSize(transpose_naive, (uint)(IntPtr.Size * 2 + 8)); cuda.SetFunctionBlockShape(transpose, BLOCK_DIM, BLOCK_DIM, 1); cuda.SetParameter(transpose, 0, (uint)d_odata.Pointer); cuda.SetParameter(transpose, IntPtr.Size, (uint)d_idata.Pointer); cuda.SetParameter(transpose, IntPtr.Size * 2, (uint)size_x); cuda.SetParameter(transpose, IntPtr.Size * 2 + 4, (uint)size_y); cuda.SetParameterSize(transpose, (uint)(IntPtr.Size * 2 + 8)); // warmup so we don't time CUDA startup cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM); cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM); //System.Threading.Thread.Sleep(10); int numIterations = 100; Console.WriteLine("Transposing a {0} by {1} matrix of floats...", size_x, size_y); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); cuda.RecordEvent(start); for (int i = 0; i < numIterations; i++) { cuda.Launch(transpose_naive, size_x / BLOCK_DIM, size_y / BLOCK_DIM); } cuda.SynchronizeContext(); cuda.RecordEvent(end); cuda.SynchronizeContext(); float naiveTime = cuda.ElapsedTime(start, end); Console.WriteLine("Naive transpose average time: {0} ms\n", naiveTime / numIterations); cuda.RecordEvent(start); for (int i = 0; i < numIterations; i++) { cuda.Launch(transpose, size_x / BLOCK_DIM, size_y / BLOCK_DIM); } cuda.SynchronizeContext(); cuda.RecordEvent(end); cuda.SynchronizeContext(); float optimizedTime = cuda.ElapsedTime(start, end); Console.WriteLine("Optimized transpose average time: {0} ms\n", optimizedTime / numIterations); float[] h_odata = new float[size_x * size_y]; cuda.CopyDeviceToHost <float>(d_odata, h_odata); float[] reference = new float[size_x * size_y]; computeGold(reference, h_idata, size_x, size_y); bool res = CompareF(reference, h_odata, size_x * size_y); Console.WriteLine("Test {0}", res == true? "PASSED":"FAILED"); cuda.Free(d_idata); cuda.Free(d_odata); Console.ReadKey(); }
/// <summary> /// implementation of sparese matrix product /// </summary> /// <param name="repetition">how many times kernel should be launch</param> /// <param name="moduleFunction">cuda kenrel name</param> /// <param name="blockSizeX">block size X</param> /// <param name="blockSizeY">block size Y</param> /// <param name="transposeGrid">indicate that grid dimensions should be /// computed alternativly, if false than gridDimY- connected with rows /// else gridDim.Y conected with cols</param> /// <returns></returns> public static float[] CRSSparseMM(int repetition, string moduleFunction, int blockSizeX,int blockSizeY, bool transposeGrid) { //int blockSizeX = 4; //int blockSizeY = 4; CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen,out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen,out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); CUdeviceptr BValsPtr = cuda.CopyHostToDevice(BVals); CUdeviceptr BIdxPtr = cuda.CopyHostToDevice(BIdx); CUdeviceptr BLenghtPtr = cuda.CopyHostToDevice(BRowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //CUdeviceptr dOutput = cuda.Allocate(output); IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; int Belements = BVals.Length; cuda.SetFunctionBlockShape(cuFunc,blockSizeX,blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, BLenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Aelements); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Belements); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); //CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef"); //cuda.SetTextureFlags(cuTexRef, 0); int gridDimX =(int) Math.Ceiling((Cols + 0.0) / (blockSizeX)); int gridDimY = (int)Math.Ceiling((0.0+Rows)/blockSizeY); if (transposeGrid) { gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); gridDimY = (int)Math.Ceiling((0.0 + Cols) / blockSizeY); } Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int k = 0; k < repetition; k++) { cuda.Launch(cuFunc, gridDimX, gridDimY); cuda.SynchronizeContext(); // cuda.CopyDeviceToHost(dOutput, output); Marshal.Copy(outputPtr2, output, 0, outputSize); } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Console.WriteLine("Matrix products with kernel {0}",moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(BValsPtr); cuda.Free(BIdxPtr); cuda.Free(BLenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); return output; }
public static float[] CRSSparseMMwithDenseVector(int repetition, string moduleFunction, int blockSizeX, int blockSizeY) { CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "matrixKernels.cubin")); CUfunction cuFunc = cuda.GetModuleFunction(moduleFunction); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("------------------------------------"); Console.WriteLine("init Matrix"); Stopwatch t = Stopwatch.StartNew(); //values in CRS format float[] AVals, BVals; //indexes in Crs format int[] AIdx, BIdx; //Lenght of each row in CRS format int[] ARowLen, BRowLen; int maxIndex = 0; MakeRandCrsSparseMatrix(Rows, maxRowSize, out AVals, out AIdx, out ARowLen, out maxIndex); // DisplayCrsMatrix(AVals, AIdx, ARowLen,maxIndex); MakeRandCrsSparseMatrix(Cols, maxRowSize, out BVals, out BIdx, out BRowLen, out maxIndex); //DisplayCrsMatrix(BVals, BIdx, BRowLen, maxIndex); Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr AValsPtr = cuda.CopyHostToDevice(AVals); CUdeviceptr AIdxPtr = cuda.CopyHostToDevice(AIdx); CUdeviceptr ALenghtPtr = cuda.CopyHostToDevice(ARowLen); int outputSize = Rows * Cols; float[] output = new float[outputSize]; //allocate memory for output IntPtr outputPtr2 = cuda.HostAllocate((uint)(outputSize * sizeof(float)), CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); CUdeviceptr dOutput = cuda.GetHostDevicePointer(outputPtr2, 0); //create dense vector for each column in B matrix float[] mainVec = new float[maxIndex + 1]; uint memSize = (uint)((maxIndex + 1) * sizeof(float)); CUstream stream0 =cuda.CreateStream(); IntPtr[] mainVecIntPtrs= new IntPtr[2]; //write combined memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); // //mainVecIntPtrs[0] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); //mainVecIntPtrs[1] = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_WRITECOMBINED); mainVecIntPtrs[0] = cuda.AllocateHost(memSize); mainVecIntPtrs[1] = cuda.AllocateHost(memSize); CUdeviceptr mainVecPtr = cuda.CopyHostToDeviceAsync(mainVecIntPtrs[0], memSize, stream0); //IntPtr mainVecIPtr = cuda.HostAllocate(memSize,CUDADriver.CU_MEMHOSTALLOC_PORTABLE); //CUdeviceptr mainVecPtr=cuda.CopyHostToDeviceAsync(mainVecIPtr,memSize,stream0); //mapped memory allocation //IntPtr mainVecIPtr = cuda.HostAllocate(memSize, CUDADriver.CU_MEMHOSTALLOC_DEVICEMAP); //CUdeviceptr mainVecPtr = cuda.CopyHostToDevice(mainVecIPtr, memSize); //get texture reference CUtexref cuTexRef = cuda.GetModuleTexture(module, "vectorTexRef"); cuda.SetTextureFlags(cuTexRef, 0); cuda.SetTextureAddress(cuTexRef, mainVecPtr, memSize); Console.WriteLine("copy to device takes {0}", t.Elapsed); #region set cuda parameters int Aelements = AVals.Length; cuda.SetFunctionBlockShape(cuFunc, blockSizeX, blockSizeY, 1); int offset = 0; cuda.SetParameter(cuFunc, offset, AValsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, AIdxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, ALenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(cuFunc, offset, (uint)Rows); offset += sizeof(int); cuda.SetParameter(cuFunc, offset, (uint)Cols); offset += sizeof(int); int colIndexParamOffset = offset; cuda.SetParameter(cuFunc, offset, (uint)0); offset += sizeof(int); cuda.SetParameterSize(cuFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); int gridDimX = (int)Math.Ceiling((Rows + 0.0) / (blockSizeX)); int gridDim= (Rows + blockSizeX - 1) / blockSizeX; Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); for (int rep = 0; rep < repetition; rep++) { for (int k = 0; k < Cols; k++) { Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIntPtrs[k % 2]); cuda.SynchronizeStream(stream0); cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIntPtrs[k % 2], memSize, stream0); cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k); cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer Helpers.SetBufferIdx(BIdx, BRowLen, k-1, mainVecIntPtrs[(k+1) % 2], 0.0f); //Helpers.InitBuffer(BVals, BIdx, BRowLen, k, mainVecIPtr); ////make asynchronius copy and kernel lauch //cuda.CopyHostToDeviceAsync(mainVecPtr, mainVecIPtr, memSize, stream0); //cuda.SetParameter(cuFunc, colIndexParamOffset,(uint) k); //cuda.LaunchAsync(cuFunc, gridDimX, 1, stream0); //cuda.SynchronizeStream(stream0); ////clear host buffer //Helpers.SetBufferIdx(BIdx, BRowLen, k, mainVecIPtr, 0.0f); } } cuda.RecordEvent(end); cuda.SynchronizeContext(); timer.Stop(); float cudaTime = cuda.ElapsedTime(start, end); Marshal.Copy(outputPtr2, output, 0, outputSize); Console.WriteLine("Matrix products with kernel {0}", moduleFunction); Console.WriteLine(" takes {0} ms stopwatch time {1} ms", cudaTime, timer.Elapsed); int lenght = displayCount;// Math.Min(displayCount, Rows); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(AValsPtr); cuda.Free(AIdxPtr); cuda.Free(ALenghtPtr); cuda.Free(dOutput); cuda.DestroyEvent(start); cuda.DestroyEvent(end); cuda.DestroyStream(stream0); cuda.Free(mainVecPtr); cuda.DestroyTexture(cuTexRef); return output; }
private static float[] CuRBFCSRCached() { //always the same values Random rnd = new Random(1); CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin")); CUfunction structPassFunc = cuda.GetModuleFunction("RBFspmv_csr_vector"); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("init arrays"); Stopwatch t = Stopwatch.StartNew(); List<float> vecValsL = new List<float>(N * maxRowSize / 2); List<int> vecIdxL = new List<int>(N * maxRowSize / 2); List<int> vecLenghtL = new List<int>(N); float[] vecVals; int[] vecIdx; int[] vecLenght; float[] selfDot = new float[N]; maxIndex = 0; int vecStartIdx = 0; for (int i = 0; i < N; i++) { int vecSize = avgElements + i % stdElements; float[] vals = Helpers.InitValues(i, vecSize, maxVal); vecValsL.AddRange(vals); for (int z = 0; z < vals.Length; z++) { selfDot[i] += vals[z] * vals[z]; } int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex); vecIdxL.AddRange(index); vecLenghtL.Add(vecStartIdx); vecStartIdx += vecSize; } //for last index vecLenghtL.Add(vecStartIdx); vecVals = vecValsL.ToArray(); vecIdx = vecIdxL.ToArray(); vecLenght = vecLenghtL.ToArray(); float[] mainVec = new float[maxIndex + 1]; for (int j = vecLenght[mainIndex]; j < vecLenght[mainIndex + 1]; j++) { int idx = vecIdx[j]; float val = vecVals[j]; mainVec[idx] = val; } Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals); CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx); CUdeviceptr vecLenghtPtr = cuda.CopyHostToDevice(vecLenght); CUdeviceptr selfDotPtr = cuda.CopyHostToDevice(selfDot); //copy to texture CUarray cuArr = cuda.CreateArray(mainVec); cuda.CopyHostToArray(cuArr, mainVec, 0); CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef"); cuda.SetTextureFlags(cuTexRef, 0); cuda.SetTextureArray(cuTexRef, cuArr); float[] output = new float[N]; CUdeviceptr dOutput = cuda.Allocate(output); Console.WriteLine("copy to device takes {0}", t.Elapsed); cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1); int offset = 0; cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, vecLenghtPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, selfDotPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, (uint)N); offset += sizeof(int); cuda.SetParameter(structPassFunc, offset, (uint)mainIndex); offset += sizeof(int); cuda.SetParameter(structPassFunc, offset, Gamma); offset += sizeof(float); cuda.SetParameter(structPassFunc, offset, (uint)vecStartIdx); offset += sizeof(int); cuda.SetParameterSize(structPassFunc, (uint)offset); Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); cuda.Launch(structPassFunc, blocksPerGrid, 1); cuda.RecordEvent(end); cuda.SynchronizeContext(); //cuda.SynchronizeEvent(end); timer.Stop(); float naiveTime = cuda.ElapsedTime(start, end); Console.Write("csr vector Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed); cuda.CopyDeviceToHost(dOutput, output); int lenght = Math.Min(displayCount, N); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(valsPtr); cuda.Free(idxPtr); cuda.Free(dOutput); cuda.Free(selfDotPtr); cuda.Free(vecLenghtPtr); cuda.DestroyArray(cuArr); cuda.DestroyTexture(cuTexRef); cuda.DestroyEvent(start); cuda.DestroyEvent(end); return output; }
private static unsafe float[] CuDotProdSparseVecStruct() { int sparseVecSize = sizeof(SparseVecPtr); uint size = (uint)(N * sizeof(SparseVecPtr)); //always the same values Random rnd = new Random(1); CUDA cuda = new CUDA(0, true); // load module cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin")); //CUfunction structPassFunc = cuda.GetModuleFunction("DotProd"); CUfunction structPassFunc = cuda.GetModuleFunction("DotProd2"); SparseVecPtr[] vectors = new SparseVecPtr[N]; Console.WriteLine("init and copy data"); Stopwatch t = Stopwatch.StartNew(); mainIndex = StartingIndex; for (int i = 0; i < N; i++) { vectors[i] = new SparseVecPtr(); int vecSize = avgElements + i % stdElements; vectors[i].size = vecSize; float[] vals = Helpers.InitValues(i, vecSize, maxVal); int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex); CUdeviceptr valsPtr = cuda.CopyHostToDevice(vals); CUdeviceptr idxPtr = cuda.CopyHostToDevice(index); vectors[i].indices = new IntPtr(idxPtr.Pointer); vectors[i].values = (IntPtr)valsPtr.Pointer; } GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned); IntPtr ptr = handle.AddrOfPinnedObject(); float[] output = new float[N]; //CUdeviceptr dVectors = cuda.CopyHostToDevice(vectors); CUdeviceptr dVectors = cuda.CopyHostToDevice(ptr, size); CUdeviceptr dOutput = cuda.Allocate(output); Console.WriteLine("copy and init takes {0}", t.Elapsed); #region set cuda parameters cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1); int offset = 0; cuda.SetParameter(structPassFunc, offset, dVectors.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, (uint)mainIndex); offset += sizeof(int); cuda.SetParameter(structPassFunc, offset, (uint)N); offset += sizeof(int); cuda.SetParameterSize(structPassFunc, (uint)offset); #endregion Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); cuda.Launch(structPassFunc, blocksPerGrid, 1); cuda.RecordEvent(end); cuda.SynchronizeContext(); //cuda.SynchronizeEvent(end); timer.Stop(); float naiveTime = cuda.ElapsedTime(start, end); Console.Write("Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed); cuda.CopyDeviceToHost(dOutput, output); int lenght = Math.Min(displayCount, N); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(dVectors); cuda.Free(dOutput); return output; }
private static float[] CuDotProdEllPackTexCached() { //always the same values Random rnd = new Random(1); CUDA cuda = new CUDA(0, true); // load module CUmodule module = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin")); CUfunction structPassFunc = cuda.GetModuleFunction("DotProdEllPackCached"); int maxRowSize = avgElements + stdElements - 1; Console.WriteLine("init arrays"); Stopwatch t = Stopwatch.StartNew(); float[] vecVals = new float[N * maxRowSize]; int[] vecIdx = new int[N * maxRowSize]; maxIndex = 0; for (int i = 0; i < N; i++) { int vecSize = avgElements + i % stdElements; float[] vals = Helpers.InitValues(i, vecSize, maxVal); //values are column-major aligment for (int z = 0; z < vals.Length; z++) { int m = z * N + i; vecVals[m] = vals[z]; } //Array.Copy(vals,0,vecVals,i*maxRowSize,vals.Length); int[] index = Helpers.InitIndices(i, vecSize, ref maxIndex); //Array.Copy(index, 0, vecIdx, i * maxRowSize, index.Length); for (int z = 0; z < index.Length; z++) { int m = z * N + i; vecIdx[m] = index[z]; } } float[] mainVec = new float[maxIndex + 1]; for (int j = 0; j < maxRowSize; j++) { int idx = vecIdx[mainIndex + N * j]; float val = vecVals[mainIndex + N * j]; mainVec[idx] = val; } Console.WriteLine("Init takes {0}", t.Elapsed); t.Start(); CUdeviceptr valsPtr = cuda.CopyHostToDevice(vecVals); CUdeviceptr idxPtr = cuda.CopyHostToDevice(vecIdx); CUarray cuArr = cuda.CreateArray(mainVec); cuda.CopyHostToArray(cuArr, mainVec, 0); //CUDAArrayDescriptor cuDesc = new CUDAArrayDescriptor(); //cuDesc.Format = CUArrayFormat.Float; //cuDesc.NumChannels = 1; //cuDesc.Width = maxIndex+1; CUtexref cuTexRef = cuda.GetModuleTexture(module, "texRef"); cuda.SetTextureFlags(cuTexRef, 0); cuda.SetTextureArray(cuTexRef, cuArr); float[] output = new float[N]; CUdeviceptr dOutput = cuda.Allocate(output); Console.WriteLine("copy to device takes {0}", t.Elapsed); cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1); int offset = 0; cuda.SetParameter(structPassFunc, offset, valsPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, idxPtr.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, (uint)maxRowSize); offset += sizeof(int); cuda.SetParameter(structPassFunc, offset, (uint)N); offset += sizeof(int); cuda.SetParameterSize(structPassFunc, (uint)offset); Console.WriteLine("start computation"); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); Stopwatch timer = Stopwatch.StartNew(); cuda.RecordEvent(start); cuda.Launch(structPassFunc, blocksPerGrid, 1); cuda.RecordEvent(end); cuda.SynchronizeContext(); //cuda.SynchronizeEvent(end); timer.Stop(); float naiveTime = cuda.ElapsedTime(start, end); Console.Write("EllPack Cached Dot products with mainIndex {0} and {1}-vectors takes {2} ms stopwatch time {3} ms", mainIndex, N, naiveTime, timer.Elapsed); cuda.CopyDeviceToHost(dOutput, output); int lenght = Math.Min(displayCount, N); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } cuda.Free(valsPtr); cuda.Free(idxPtr); cuda.Free(dOutput); cuda.DestroyArray(cuArr); cuda.DestroyTexture(cuTexRef); return output; }
private static void CuAddVec() { int N = 50000; uint size = (uint)N * sizeof(float); CUDA cuda = new CUDA(0, true); // load module cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin")); CUfunction vecAddFunc = cuda.GetModuleFunction("VecAdd"); float[] A = new float[N]; float[] B = new float[N]; float[] C = new float[N]; for (int i = 0; i < A.Length; i++) { A[i] = (float)i; B[i] = (float)i + 0.1f; } CUdeviceptr dA = cuda.CopyHostToDevice(A); CUdeviceptr dB = cuda.CopyHostToDevice(B); CUdeviceptr dC = cuda.Allocate(A); int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; //error = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1); cuda.SetFunctionBlockShape(vecAddFunc, threadsPerBlock, 1, 1); int offset = 0; cuda.SetParameter(vecAddFunc, offset, (uint)dA.Pointer); offset += IntPtr.Size; cuda.SetParameter(vecAddFunc, offset, (uint)dB.Pointer); offset += IntPtr.Size; cuda.SetParameter(vecAddFunc, offset, (uint)dC.Pointer); offset += IntPtr.Size; cuda.SetParameter(vecAddFunc, offset, (uint)N); offset += sizeof(int); cuda.SetParameterSize(vecAddFunc, (uint)offset); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); cuda.RecordEvent(start); cuda.Launch(vecAddFunc, blocksPerGrid, 1); cuda.RecordEvent(end); cuda.SynchronizeContext(); //cuda.SynchronizeEvent(end); float naiveTime = cuda.ElapsedTime(start, end); Console.Write("adding takes {0}ms", naiveTime); cuda.CopyDeviceToHost(dC, C); for (int i = 0; i < 10; i++) { Console.WriteLine("{0}-{1}", i, C[i]); } }
static void Main(string[] args) { // Create a new instance of CUDA class, select 1st device. CUDA cuda = new CUDA(0, true); // Prepare parameters. int n = 16 * 1024 * 1024; uint nbytes = (uint)(n * sizeof(int)); int value = 26; // allocate host memory int[] a = new int[n]; // allocate device memory CUdeviceptr d_a = cuda.Allocate<int>(a); CUDADriver.cuMemsetD8(d_a, 0xff, nbytes); // load module cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "asyncAPI.ptx")); CUfunction func = cuda.GetModuleFunction("increment_kernel"); // set kernel launch configuration cuda.SetFunctionBlockShape(func, 512, 1, 1); // create cuda event handles CUevent start = cuda.CreateEvent(); CUevent stop = cuda.CreateEvent(); // asynchronously issue work to the GPU (all to stream 0) CUstream stream = new CUstream(); cuda.RecordEvent(start); cuda.CopyHostToDeviceAsync<int>(d_a, a, stream); // set parameters for kernel function cuda.SetParameter(func, 0, (uint)d_a.Pointer); cuda.SetParameter(func, IntPtr.Size, (uint)value); cuda.SetParameterSize(func, (uint)(IntPtr.Size + 4)); // actually launch kernel cuda.LaunchAsync(func, n / 512, 1, stream); // wait for every thing to finish, then start copy back data cuda.CopyDeviceToHostAsync<int>(d_a, a, stream); cuda.RecordEvent(stop); // print the cpu and gpu times Console.WriteLine("time spent executing by the GPU: {0} ms", cuda.ElapsedTime(start, stop)); // check the output for correctness if (CorrectOutput(a, value)) Console.WriteLine("Test PASSED"); else Console.WriteLine("Test FAILED"); // release resources cuda.DestroyEvent(start); cuda.DestroyEvent(stop); cuda.Free(d_a); }
private void Worker() { #if CUDA int nN = _nMergingDeviceNumber; string sS = "CUDA-" + nN + "-" + _nIndex; try { Command cCmd; CUDA cCUDA; #region CUDA Init try { cCUDA = new CUDA(true); cCUDA.CreateContext(nN % 1000); // number of cuda in prefs (still alwais 0) } catch (Exception ex) { if (ex is CUDAException) { ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); } throw new Exception("CreateContext(" + nN % 1000 + ") error. Try to change CUDA's card number in prefs", ex); } (new Logger(sS)).WriteDebug("CreateContext(" + nN % 1000 + ") is ok!"); uint nMemoryReservedForMerge = 2 * 1024 * 1024; //PREFERENCES типа <memory reserved="2097152" /> uint nMemoryStarvationThreshold = cCUDA.TotalMemory / 2; //PREFERENCES через проценты... типа <memory starvation="50%" /> uint nMemoryFree; string sModule = "CUDAFunctions_" + Preferences.nCUDAVersion + "_x" + (IntPtr.Size * 8); if (Logger.bDebug) { (new Logger(sS)).WriteDebug(sModule + " Current CUDA = [name=" + cCUDA.CurrentDevice.Name + "][compute_capability=" + cCUDA.CurrentDevice.ComputeCapability + "]"); } cCUDA.LoadModule((byte[])Properties.Resource.ResourceManager.GetObject(sModule)); // $(ProjectDir)Resources\CUDAFunctions.cubin CUfunction cCUDAFuncMerge = cCUDA.GetModuleFunction("CUDAFrameMerge"); int nThreadsPerBlock = 16; //32 //256 //пришлось уменьшить с 512 до 256 сридов на блок, потому что при добавлении "движения" и операций с float, ловил ошибку: Too Many Resources Requested for Launch (This error means that the number of registers available on the multiprocessor is being exceeded. Reduce the number of threads per block to solve the problem) cCUDA.SetFunctionBlockShape(cCUDAFuncMerge, nThreadsPerBlock, nThreadsPerBlock, 1); CUDADriver.cuParamSetSize(cCUDAFuncMerge, 8); Dictionary <long, CUdeviceptr> ahPMIDs_DevicePointers = new Dictionary <long, CUdeviceptr>(); CUdeviceptr cPMs; CUdeviceptr cInfos; CUdeviceptr cAlphaMap; CUdeviceptr cAlphaMap_info3d; CUdeviceptr cAlphaMap_info2d; if (true) { //IntPtr[] aPointersByAlpha = new IntPtr[254]; //те самые поинтеры-альфы. Ссылаются на массивы поинтеров B, т.е. BackGrounds //IntPtr[] aPointersByBackground = new IntPtr[256]; // те самые массивы поинтеров B, т.е. BackGrounds byte[] aAlphaMap = new byte[(byte.MaxValue - 1) * (byte.MaxValue + 1) * (byte.MaxValue + 1)]; int[] aAlphaMap_info3d = new int[254]; // начала 2d слоёв ushort[] aAlphaMap_info2d = new ushort[256]; // начала строк в одном 2d int nResult, nIndx = 0, nIndxInfo = 0, nIndx2d = 0; for (byte nAlpha = 1; 255 > nAlpha; nAlpha++) { aAlphaMap_info3d[nIndxInfo++] = nIndx; for (ushort nBackground = 0; 256 > nBackground; nBackground++) { if (nAlpha == 1) { aAlphaMap_info2d[nIndx2d++] = (ushort)nIndx; } for (ushort nForeground = 0; 256 > nForeground; nForeground++) { if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5))) { nResult = 255; } aAlphaMap[nIndx++] = (byte)nResult; } //aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer; } //aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer; } cAlphaMap_info3d = cCUDA.CopyHostToDevice <int>(aAlphaMap_info3d); cAlphaMap = cCUDA.CopyHostToDevice <byte>(aAlphaMap); cAlphaMap_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap_info2d); } CUdeviceptr cAlphaMap2; CUdeviceptr cAlphaMap2_info2d; { byte[] aAlphaMap2 = new byte[(byte.MaxValue - 1) * (byte.MaxValue - 1)]; ushort[] aAlphaMap2_info2d = new ushort[254]; int nIndx = 0, nIndx2d = 0; for (byte nFGColorAlpha = 1; 255 > nFGColorAlpha; nFGColorAlpha++) // можно использовать симметрию умножения, но х с ней пока { aAlphaMap2_info2d[nIndx2d++] = (ushort)nIndx; for (byte nPixelAlpha = 1; 255 > nPixelAlpha; nPixelAlpha++) { aAlphaMap2[nIndx++] = (byte)((float)nFGColorAlpha * nPixelAlpha / 255 + 0.5); } } cAlphaMap2 = cCUDA.CopyHostToDevice <byte>(aAlphaMap2); cAlphaMap2_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap2_info2d); } CUdeviceptr cAlphaMap3; CUdeviceptr cAlphaMap3_info2d; { byte[] aAlphaMap3 = new byte[byte.MaxValue * (byte.MaxValue - 1)]; ushort[] aAlphaMap3_info2d = new ushort[255]; int nIndx = 0, nIndx2d = 0; for (ushort nFGColorAlpha = 1; 256 > nFGColorAlpha; nFGColorAlpha++) { aAlphaMap3_info2d[nIndx2d++] = (ushort)nIndx; for (byte nMask = 1; 255 > nMask; nMask++) { aAlphaMap3[nIndx++] = (byte)(nFGColorAlpha * ((255 - nMask) / 255f) + 0.5); } } cAlphaMap3 = cCUDA.CopyHostToDevice <byte>(aAlphaMap3); cAlphaMap3_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap3_info2d); } #endregion CUDA Init #if DEBUG Dictionary <long, DateTime> ahDebug = new Dictionary <long, DateTime>(); Dictionary <long, Area> ahDebugAr = new Dictionary <long, Area>(); #endif DateTime dtNextTime = DateTime.MinValue, dtNow; bool bSet; List <IntPtr> aDPs; List <PixelsMap> aPMs; while (true) { if (1 > aqQueue.CountGet() && (dtNow = DateTime.Now) > dtNextTime) { dtNextTime = dtNow.AddMinutes(20); #if DEBUG dtNow = dtNow.Subtract(TimeSpan.FromHours(2)); string sMessage = ""; foreach (long nID in ahDebug.OrderBy(o => o.Value).Select(o => o.Key)) { if (dtNow > ahDebug[nID]) { sMessage += "<br>[" + nID + " - " + ahDebug[nID].ToString("HH:mm:ss") + "]" + ahDebugAr[nID].ToString(); } } #endif (new Logger(sS)).WriteDebug("CUDA free memory:" + cCUDA.FreeMemory #if DEBUG + "; possibly timeworn allocations:" + (1 > sMessage.Length ? "no" : sMessage) #endif ); } cCmd = aqQueue.Dequeue(); //если нечего отдать - заснёт switch (cCmd.eID) { case Command.ID.Allocate: #region try { cCmd.cPM._cException = null; if (1 > cCmd.cPM._nID) { if (0 < cCmd.cPM._nBytesQty) { nMemoryFree = cCUDA.FreeMemory; if (nMemoryReservedForMerge < nMemoryFree - cCmd.cPM._nBytesQty) { bMemoryStarvation = (nMemoryFree < nMemoryStarvationThreshold); (new Logger(sS)).WriteDebug3("pixelmap allocateCUDA [current_id=" + _nCurrentID + "]"); cCmd.cPM._nID = System.Threading.Interlocked.Increment(ref _nCurrentID); ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.Allocate(cCmd.cPM._nBytesQty)); #if DEBUG ahDebug.Add(cCmd.cPM._nID, DateTime.Now); ahDebugAr.Add(cCmd.cPM._nID, cCmd.cPM.stArea); #endif } else { bMemoryStarvation = true; throw new Exception("out of memory in CUDA device during Allocate. Only 2 MBytes reserved for the Merge"); } } else { throw new Exception("bytes quantity in PixelsMap have to be greater than zero for Allocate [_bDisposed = " + cCmd.cPM._bDisposed + "][_bProcessing = " + cCmd.cPM._bProcessing + "][_stPosition.X = " + cCmd.cPM._stPosition.X + "][_stPosition.Y = " + cCmd.cPM._stPosition.Y + "][_bTemp = " + cCmd.cPM._bTemp + "][_dt = " + cCmd.cPM._dtCreate + "][_nBytesQty = " + cCmd.cPM._nBytesQty + "][_nID = " + cCmd.cPM._nID + "][_nShiftTotalX = " + cCmd.cPM._nShiftTotalX + "][_stArea.nHeight = " + cCmd.cPM._stArea.nHeight + "][_stArea.nWidth = " + cCmd.cPM._stArea.nWidth + "][bKeepAlive = " + cCmd.cPM.bKeepAlive + "][eAlpha = " + cCmd.cPM.eAlpha + "][bCUDA = " + cCmd.cPM.stMergingMethod + "][nAlphaConstant = " + cCmd.cPM.nAlphaConstant + "][nID = " + cCmd.cPM.nID + "][nLength = " + cCmd.cPM.nLength + "][stArea.nHeight = " + cCmd.cPM.stArea.nHeight + "][stArea.nWidth = " + cCmd.cPM.stArea.nWidth + "]"); } } else { throw new Exception("PixelsMap ID have to be zero for Allocate"); } } catch (Exception ex) { if (ex is CUDAException) { ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); } (new Logger(sS)).WriteError(ex); (new Logger(sS)).WriteDebug("bytes qty:" + cCmd.cPM._nBytesQty); cCmd.cPM._cException = ex; } cCmd.cMRE.Set(); break; #endregion case Command.ID.CopyIn: #region try { cCmd.cPM._cException = null; if (1 > cCmd.cPM._nID) { if (cCUDA.FreeMemory - cCmd.cPM._nBytesQty > nMemoryReservedForMerge) { (new Logger(sS)).WriteDebug3("pixelmap copyinCUDA not allocated [pm_id=" + _nCurrentID + "]"); cCmd.cPM._nID = System.Threading.Interlocked.Increment(ref _nCurrentID); if (cCmd.ahParameters.ContainsKey(typeof(IntPtr))) { ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty)); } else if (cCmd.ahParameters.ContainsKey(typeof(byte[]))) { ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((byte[])cCmd.ahParameters[typeof(byte[])])); } else { throw new Exception("unknown parameter type"); } #if DEBUG ahDebug.Add(cCmd.cPM._nID, DateTime.Now); ahDebugAr.Add(cCmd.cPM._nID, cCmd.cPM.stArea); #endif } else { throw new Exception("out of memory in CUDA device during CopyIn. Only 2 MBytes reserved for the Merge."); } } else { (new Logger(sS)).WriteDebug4("pixelmap copyinCUDA allocated [pm_id=" + _nCurrentID + "]"); if (cCmd.ahParameters.ContainsKey(typeof(IntPtr))) { cCUDA.CopyHostToDevice(ahPMIDs_DevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty); } else if (cCmd.ahParameters.ContainsKey(typeof(byte[]))) { cCUDA.CopyHostToDevice(ahPMIDs_DevicePointers[cCmd.cPM._nID], (byte[])cCmd.ahParameters[typeof(byte[])]); } else { throw new Exception("unknown parameter type"); } } } catch (Exception ex) { if (ex is CUDAException) { ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); } (new Logger(sS)).WriteError(ex); cCmd.cPM._cException = ex; } cCmd.cMRE.Set(); #endregion break; case Command.ID.CopyOut: #region try { if (0 < cCmd.cPM._nID) { if (!cCmd.ahParameters.ContainsKey(typeof(IntPtr))) { if (cCmd.ahParameters.ContainsKey(typeof(byte[]))) { byte[] aB = (byte[])cCmd.ahParameters[typeof(byte[])]; cCmd.cPM._aBytes = null; if (cCmd.cPM._nBytesQty != aB.Length) { (new Logger(sS)).WriteWarning("wrong array size for copyout [got:" + aB.Length + "][expected:" + cCmd.cPM._nBytesQty + "]"); } cCUDA.CopyDeviceToHost <byte>(ahPMIDs_DevicePointers[cCmd.cPM._nID], aB); } else // не юзается (см. copyout()) { cCmd.cPM._aBytes = _cBinM.BytesGet((int)cCmd.cPM._nBytesQty, 3); cCUDA.CopyDeviceToHost <byte>(ahPMIDs_DevicePointers[cCmd.cPM._nID], cCmd.cPM._aBytes.aBytes); } } else { cCUDA.CopyDeviceToHost(ahPMIDs_DevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty); } (new Logger(sS)).WriteDebug5("copy out [id:" + cCmd.cPM._nID + "][ptr:" + ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer + "]"); } else { throw new Exception("PixelsMap have to be allocated for CopyOut"); } } catch (Exception ex) { if (ex is CUDAException) { ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); } (new Logger(sS)).WriteError(ex); cCmd.cPM._cException = ex; } cCmd.cMRE.Set(); #endregion break; case Command.ID.Merge: #region bSet = false; try { aPMs = (List <PixelsMap>)cCmd.ahParameters[typeof(List <PixelsMap>)]; DisCom.MergeInfo cMergeInfo = (DisCom.MergeInfo)cCmd.ahParameters[typeof(DisCom.MergeInfo)]; aDPs = new List <IntPtr>(); if (1 > cCmd.cPM._nID) { throw new Exception("background PixelsMap have to be allocated for Merge"); } aDPs.Add((IntPtr)ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer); for (int nIndx = 0; nIndx < aPMs.Count; nIndx++) { if (!ahPMIDs_DevicePointers.ContainsKey(aPMs[nIndx]._nID)) { throw new Exception("there is a corrupted ID in layers for merge [id:" + aPMs[nIndx]._nID + "]"); } if (1 > ahPMIDs_DevicePointers[aPMs[nIndx]._nID].Pointer) { throw new Exception("there is an empty pointer in layers for merge [id:" + aPMs[nIndx]._nID + "]"); } aDPs.Add((IntPtr)ahPMIDs_DevicePointers[aPMs[nIndx]._nID].Pointer); } cPMs = cCUDA.CopyHostToDevice <IntPtr>(aDPs.ToArray()); cInfos = cCUDA.CopyHostToDevice(cMergeInfo, cMergeInfo.SizeGet()); // operator intptr in DisCom.MergeInfo cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, 0, (IntPtr)cPMs.Pointer); cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size, (IntPtr)cInfos.Pointer); cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 2, (IntPtr)cAlphaMap.Pointer); // cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 3, (IntPtr)cAlphaMap_info3d.Pointer); // cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 4, (IntPtr)cAlphaMap_info2d.Pointer); // cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 5, (IntPtr)cAlphaMap2.Pointer); cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 6, (IntPtr)cAlphaMap2_info2d.Pointer); // cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 7, (IntPtr)cAlphaMap3.Pointer); cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 8, (IntPtr)cAlphaMap3_info2d.Pointer); // cCUDA.SetParameterSize(cCUDAFuncMerge, (uint)(IntPtr.Size * 9)); int nIterationsX = (0 == cMergeInfo.nBackgroundWidth % nThreadsPerBlock ? cMergeInfo.nBackgroundWidth / nThreadsPerBlock : cMergeInfo.nBackgroundWidth / nThreadsPerBlock + 1); int nIterationsY = (0 == cMergeInfo.nBackgroundHight % nThreadsPerBlock ? cMergeInfo.nBackgroundHight / nThreadsPerBlock : cMergeInfo.nBackgroundHight / nThreadsPerBlock + 1); //int nIterationsX = (0 == cMergeInfo.nBackgroundHight % nThreadsPerBlock ? cMergeInfo.nBackgroundHight / nThreadsPerBlock : cMergeInfo.nBackgroundHight / nThreadsPerBlock + 1); cCUDA.Launch(cCUDAFuncMerge, nIterationsX, nIterationsY); cCUDA.Free(cPMs); cCUDA.Free(cInfos); cCmd.cMRE.Set(); bSet = true; cMergeInfo.Dispose(); for (int nIndx = 0; nIndx < aPMs.Count; nIndx++) { lock (aPMs[nIndx]._cSyncRoot) aPMs[nIndx]._bProcessing = false; aPMs[nIndx].Dispose(); } } catch (Exception ex) { cCmd.cPM._cException = ex; if (!bSet) { cCmd.cMRE.Set(); } if (ex is CUDAException) { ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); } (new Logger(sS)).WriteError(ex); } #endregion break; case Command.ID.Dispose: #region (new Logger(sS)).Write(Logger.Level.debug4, "dispose: in"); try { if (ahPMIDs_DevicePointers.ContainsKey(cCmd.cPM._nID)) { if (0 < cCmd.cPM._nID && 0 < ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer) { cCUDA.Free(ahPMIDs_DevicePointers[cCmd.cPM._nID]); //cCUDA.SynchronizeContext(); bMemoryStarvation = (cCUDA.FreeMemory < nMemoryStarvationThreshold); (new Logger(sS)).WriteDebug3("dispose [id:" + cCmd.cPM._nID + "][ptr:" + ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer + "]"); } ahPMIDs_DevicePointers.Remove(cCmd.cPM._nID); #if DEBUG ahDebug.Remove(cCmd.cPM._nID); ahDebugAr.Remove(cCmd.cPM._nID); #endif cCmd.cPM._nID = 0; } } catch (Exception ex) { if (ex is CUDAException) { ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); } (new Logger(sS)).WriteError(ex); cCmd.cPM._cException = ex; } (new Logger(sS)).Write(Logger.Level.debug4, "dispose: out"); #endregion break; } } } catch (Exception ex) { if (ex is CUDAException) { ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex); } (new Logger(sS)).WriteError("CUDA STOPPED!!!! [id = " + _nIndex + "]", ex); } #endif }
private static unsafe void CuStructPass() { int N = 4; int sparseVecSize = sizeof(SparseVecPtr); uint size = (uint)(N * sizeof(SparseVecPtr)); CUDA cuda = new CUDA(0, true); // load module cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "structKernel.cubin")); CUfunction structPassFunc = cuda.GetModuleFunction("StructPass"); SparseVecPtr[] vectors = new SparseVecPtr[N]; for (int i = 0; i < N; i++) { vectors[i] = new SparseVecPtr(); vectors[i].size = 2; float[] vals = new float[2] { (float)i + 1 % 5, (float)i + 2 % 7 }; //GCHandle valHandle = GCHandle.Alloc(vals, GCHandleType.Pinned); //vectors[i].values = valHandle.AddrOfPinnedObject(); int[] index = new int[2] { i % 5, i % 7 }; //GCHandle idxHandle = GCHandle.Alloc(index, GCHandleType.Pinned); //vectors[i].indices = idxHandle.AddrOfPinnedObject(); //valHandle.Free(); //idxHandle.Free(); CUdeviceptr valsPtr = cuda.CopyHostToDevice(vals); CUdeviceptr idxPtr = cuda.CopyHostToDevice(index); vectors[i].indices = new IntPtr(idxPtr.Pointer); vectors[i].values = (IntPtr)valsPtr.Pointer; } GCHandle handle = GCHandle.Alloc(vectors, GCHandleType.Pinned); IntPtr ptr = handle.AddrOfPinnedObject(); float[] output = new float[N]; //CUdeviceptr dVectors = cuda.CopyHostToDevice(vectors); CUdeviceptr dVectors = cuda.CopyHostToDevice(ptr, size); CUdeviceptr dOutput = cuda.Allocate(output); int threadsPerBlock = 256; int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock; //error = cuFuncSetBlockShape(vecAdd, threadsPerBlock, 1, 1); cuda.SetFunctionBlockShape(structPassFunc, threadsPerBlock, 1, 1); int offset = 0; cuda.SetParameter(structPassFunc, offset, (uint)dVectors.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, (uint)dOutput.Pointer); offset += IntPtr.Size; cuda.SetParameter(structPassFunc, offset, (uint)N); offset += sizeof(int); cuda.SetParameterSize(structPassFunc, (uint)offset); CUevent start = cuda.CreateEvent(); CUevent end = cuda.CreateEvent(); cuda.RecordEvent(start); cuda.Launch(structPassFunc, blocksPerGrid, 1); cuda.RecordEvent(end); cuda.SynchronizeContext(); //cuda.SynchronizeEvent(end); float naiveTime = cuda.ElapsedTime(start, end); Console.Write("passing struct takes {0}ms", naiveTime); cuda.CopyDeviceToHost(dOutput, output); int lenght = Math.Min(10, N); Console.WriteLine(); for (int i = 0; i < lenght; i++) { Console.WriteLine("{0}-{1}", i, output[i]); } }
//private double ComputeObj(float[] w, float[] alpha, Problem<SparseVec> sub_prob, float[] diag) //{ // double v = 0, v1=0; // int nSV = 0; // for (int i = 0; i < w.Length; i++) // { // v += w[i] * w[i]; // v1 += 0.5*w[i] * w[i]; // } // for (int i = 0; i < alpha.Length; i++) // { // sbyte y_i = (sbyte)sub_prob.Y[i]; // //original line // //v += alpha[i] * (alpha[i] * diag[GETI(y_i, i)] - 2); // v += alpha[i] * (alpha[i] * diag[y_i + 1] - 2); // v1 += 0.5* alpha[i] * (alpha[i] * diag[y_i + 1] - 2); // if (alpha[i] > 0) ++nSV; // } // v = v / 2; // // Debug.WriteLine("Objective value = {0}", v); // // Debug.WriteLine("nSV = {0}", nSV); // return v; //} protected void InitCudaModule() { cuda = new CUDA(0, true); cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName)); cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName); cuFuncSolver = cuda.GetModuleFunction(cudaSolveL2SVM); cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW); }
public CudaFunctionCall(CUDA cuda, string functionName) { _cuda = cuda; _function = cuda.GetModuleFunction(functionName); _offset = 0; }
private void InitCudaModule() { cuda = new CUDA(0, true); cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName)); cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName); cuFuncGradFinalize = cuda.GetModuleFunction(cudaGradFinalizeName); cuFuncComputeBBstep = cuda.GetModuleFunction(cudaComputeBBStepName); cuFuncObjSquareW = cuda.GetModuleFunction(cudaObjWName); cuFuncObjSquareAlpha = cuda.GetModuleFunction(cudaObjAlphaName); cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW); cuFuncUpdateAlpha = cuda.GetModuleFunction(cudaUpdateAlphaName); cuFuncMaxNorm = cuda.GetModuleFunction(cudaMaxNormName); }
unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify) { cuda = _cuda; residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames; bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames; samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount; int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames; int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames; int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FlaCudaWriter.maxFrames; cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2); cudaSamples = cuda.Allocate((uint)samplesBufferLen); cudaResidual = cuda.Allocate((uint)samplesBufferLen); cudaLPCData = cuda.Allocate((uint)lpcDataLen); cudaPartitions = cuda.Allocate((uint)partitionsLen); cudaRiceParams = cuda.Allocate((uint)riceParamsLen); cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4); cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames))); cudaResidualTasks = cuda.Allocate((uint)residualTasksLen); cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen); cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames)); CUResult cuErr = CUResult.Success; if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen); if (cuErr != CUResult.Success) { if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero; if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero; if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero; if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero; if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero; throw new CUDAException(cuErr); } cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor"); cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr"); cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr"); cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2"); cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits"); cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC"); cudaQuantizeLPC = cuda.GetModuleFunction("cudaQuantizeLPC"); cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice"); cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual"); cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8"); cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12"); cudaEstimateResidual1 = cuda.GetModuleFunction("cudaEstimateResidual1"); cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod"); cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod"); cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo"); cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition"); cudaCalcPartition16 = cuda.GetModuleFunction("cudaCalcPartition16"); cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition"); cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition"); cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter"); cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder"); stream = cuda.CreateStream(); samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1]; frame = new FlacFrame(channelCount); frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length); if (do_verify) { verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100)); verify.DoCRC = false; } }