/// <summary>
/// Sets up the CUDA side of the evaluator: creates the driver wrapper and a
/// context on device 0, loads the kernel module from the current directory,
/// resolves the evaluator and sign kernels, allocates two host buffers sized
/// for one support vector, binds the "svTexRef" texture to the device copy of
/// the first buffer, and uploads the support-vector labels and alphas.
/// Sets <c>IsInitialized</c> once everything has been uploaded.
/// </summary>
public override void Init()
{
    // Driver wrapper + context on device 0 (context is created with the MapHost flag).
    cuda = new CUDA(0, true);
    cuda.SetCurrentContext(cuda.CreateContext(0, CUCtxFlags.MapHost));

    // Kernel module lives next to the process working directory.
    string kernelModulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
    cuModule = cuda.LoadModule(kernelModulePath);
    cuFunc = cuda.GetModuleFunction(cudaEvaluatorKernelName);
    cuFuncSign = cuda.GetModuleFunction(cudaSignKernelName);

    stream = cuda.CreateStream();

    // One buffer holds Dim floats of the first support element.
    memSvSize = (uint)(sizeof(float) * TrainedModel.SupportElements[0].Dim);

    // Two host buffers of identical size.
    for (int slot = 0; slot < 2; slot++)
    {
        svVecIntPtrs[slot] = cuda.AllocateHost(memSvSize);
    }

    // Device copy of buffer 0 backs the texture reference read by the kernel module.
    mainVecPtr = cuda.CopyHostToDeviceAsync(svVecIntPtrs[0], memSvSize, stream);
    cuSVTexRef = cuda.GetModuleTexture(cuModule, "svTexRef");
    cuda.SetTextureFlags(cuSVTexRef, 0);
    cuda.SetTextureAddress(cuSVTexRef, mainVecPtr, memSvSize);

    // Labels are read from TrainedModel.Y by position, alphas from TrainedModel.Alpha
    // through SupportElementsIndexes. (An earlier, disabled variant took the labels
    // from TrainningProblem.Labels[idx] instead.)
    float[] svLabels = new float[TrainedModel.SupportElements.Length];
    float[] svAlphas = new float[TrainedModel.SupportElements.Length];

    Parallel.For(0, TrainedModel.SupportElementsIndexes.Length, position =>
    {
        int modelIndex = TrainedModel.SupportElementsIndexes[position];
        svLabels[position] = TrainedModel.Y[position];
        svAlphas[position] = TrainedModel.Alpha[modelIndex];
    });

    labelsPtr = cuda.CopyHostToDevice(svLabels);
    alphasPtr = cuda.CopyHostToDevice(svAlphas);

    IsInitialized = true;
}
/// <summary>
/// Creates the CUDA wrapper and a current context on device 0, loads the kernel
/// module from the current directory and resolves the evaluation and reduction
/// kernels into <c>cuFuncEval</c> and <c>cuFuncReduce</c>.
/// </summary>
private void InitCuda()
{
    cuda = new CUDA(0, true);

    // Context is created with the MapHost flag and made current right away.
    cuda.SetCurrentContext(cuda.CreateContext(0, CUCtxFlags.MapHost));

    string kernelModulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
    cuModule = cuda.LoadModule(kernelModulePath);

    cuFuncEval = cuda.GetModuleFunction(cudaEvaluatorKernelName);
    cuFuncReduce = cuda.GetModuleFunction(cudaReduceKernelName);
}
/// <summary>
/// Creates the CUDA wrapper and a context on device 0, then loads the kernel
/// module found in the current directory and resolves the product kernel into
/// <c>cuFunc</c>.
/// </summary>
/// <exception cref="ArgumentException">
/// The module file does not exist at the expected path.
/// </exception>
protected void InitCudaModule()
{
    int deviceNr = 0;
    cuda = new CUDA(deviceNr, true);
    cuCtx = cuda.CreateContext(deviceNr, CUCtxFlags.MapHost);

    string modulePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName);
    if (!File.Exists(modulePath))
    {
        // Separator added: the message used to run straight into the path,
        // which made the reported location hard to read.
        throw new ArgumentException("Failed access to cuda module: " + modulePath);
    }

    cuModule = cuda.LoadModule(modulePath);
    cuFunc = cuda.GetModuleFunction(cudaProductKernelName);
}
/// <summary>
/// Selects the CUDA target for Cudafy, obtains the device and its underlying
/// CUDA.NET object, creates and activates a context on device 0, and loads the
/// kernel module. A serialized module is reused when it deserializes and its
/// checksums verify; otherwise the module is re-translated from
/// <c>CudafyRBFSlicedEllpackKernel</c> and serialized again.
/// </summary>
private void InitCudaModule()
{
    cufy.CudafyModes.Target = cufy.eGPUType.Cuda;
    gpu = CudafyHost.GetDevice(CudafyModes.Target);

    // Reach through Cudafy to the raw CUDA.NET wrapper so a context can be created explicitly.
    CudaGPU cudafyDevice = (CudaGPU)gpu;
    cuGPU = (CUDA)cudafyDevice.CudaDotNet;
    cuGPU.SetCurrentContext(cuGPU.CreateContext(0, CUCtxFlags.MapHost));

    // gpu.EnableSmartCopy() was left disabled by the original author.

    module = CudafyModule.TryDeserialize(moduleName);
    bool cachedModuleUsable = module != null && module.TryVerifyChecksums();
    if (!cachedModuleUsable)
    {
        module = CudafyTranslator.Cudafy(typeof(CudafyRBFSlicedEllpackKernel));
        module.Serialize();
    }

    gpu.LoadModule(module);
}
/// <summary>
/// Command loop of one CUDA worker. When the CUDA symbol is defined it creates
/// a context on device (_nMergingDeviceNumber % 1000), loads the merge kernel
/// module from the embedded resources, uploads three alpha lookup tables, and
/// then processes commands taken from aqQueue forever: Allocate, CopyIn,
/// CopyOut, Merge and Dispose. Per-command failures are logged and stored in
/// cCmd.cPM._cException; any exception escaping the loop is logged as
/// "CUDA STOPPED!!!!" and ends the method.
/// When CUDA is not defined the method body is empty.
/// </summary>
private void Worker()
{
#if CUDA
    int nN = _nMergingDeviceNumber;
    string sS = "CUDA-" + nN + "-" + _nIndex; // logger category used throughout this worker
    try
    {
        Command cCmd;
        CUDA cCUDA;
        #region CUDA Init
        try
        {
            cCUDA = new CUDA(true);
            cCUDA.CreateContext(nN % 1000); // number of cuda in prefs (still always 0)
        }
        catch (Exception ex)
        {
            // Wrap CUDA failures so the CUDA error code shows up in the message text.
            if (ex is CUDAException)
            {
                ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
            }
            throw new Exception("CreateContext(" + nN % 1000 + ") error. Try to change CUDA's card number in prefs", ex);
        }
        (new Logger(sS)).WriteDebug("CreateContext(" + nN % 1000 + ") is ok!");

        uint nMemoryReservedForMerge = 2 * 1024 * 1024; // PREFERENCES: something like <memory reserved="2097152" />
        uint nMemoryStarvationThreshold = cCUDA.TotalMemory / 2; // PREFERENCES: as a percentage... something like <memory starvation="50%" />
        uint nMemoryFree;
        // Resource name encodes the configured CUDA version and the process bitness (32/64).
        string sModule = "CUDAFunctions_" + Preferences.nCUDAVersion + "_x" + (IntPtr.Size * 8);
        if (Logger.bDebug)
        {
            (new Logger(sS)).WriteDebug(sModule + " Current CUDA = [name=" + cCUDA.CurrentDevice.Name + "][compute_capability=" + cCUDA.CurrentDevice.ComputeCapability + "]");
        }
        cCUDA.LoadModule((byte[])Properties.Resource.ResourceManager.GetObject(sModule)); // $(ProjectDir)Resources\CUDAFunctions.cubin
        CUfunction cCUDAFuncMerge = cCUDA.GetModuleFunction("CUDAFrameMerge");
        // Earlier values were 32 and 256. The original author had to reduce from 512 to 256 threads
        // per block because, after adding "motion" and float operations, the launch failed with:
        // Too Many Resources Requested for Launch (This error means that the number of registers
        // available on the multiprocessor is being exceeded. Reduce the number of threads per block
        // to solve the problem)
        int nThreadsPerBlock = 16;
        // Square block: nThreadsPerBlock x nThreadsPerBlock x 1.
        cCUDA.SetFunctionBlockShape(cCUDAFuncMerge, nThreadsPerBlock, nThreadsPerBlock, 1);
        CUDADriver.cuParamSetSize(cCUDAFuncMerge, 8);

        // Maps PixelsMap IDs to the device allocation that backs them.
        Dictionary <long, CUdeviceptr> ahPMIDs_DevicePointers = new Dictionary <long, CUdeviceptr>();
        CUdeviceptr cPMs;
        CUdeviceptr cInfos;
        CUdeviceptr cAlphaMap;
        CUdeviceptr cAlphaMap_info3d;
        CUdeviceptr cAlphaMap_info2d;
        if (true) // always taken; only provides a scope for the table-building locals
        {
            //IntPtr[] aPointersByAlpha = new IntPtr[254]; // those very alpha pointers. They refer to arrays of B pointers, i.e. BackGrounds
            //IntPtr[] aPointersByBackground = new IntPtr[256]; // those very arrays of B pointers, i.e. BackGrounds
            // Table 1: for alpha 1..254, background 0..255, foreground 0..255 stores
            // alpha * (fg - bg) / 255 + bg + 0.5 truncated to int and capped at 255.
            byte[] aAlphaMap = new byte[(byte.MaxValue - 1) * (byte.MaxValue + 1) * (byte.MaxValue + 1)];
            int[] aAlphaMap_info3d = new int[254]; // starts of the 2d layers
            ushort[] aAlphaMap_info2d = new ushort[256]; // starts of the rows within one 2d layer
            int nResult, nIndx = 0, nIndxInfo = 0, nIndx2d = 0;
            for (byte nAlpha = 1; 255 > nAlpha; nAlpha++)
            {
                aAlphaMap_info3d[nIndxInfo++] = nIndx;
                for (ushort nBackground = 0; 256 > nBackground; nBackground++)
                {
                    // Row offsets are recorded only while filling the first layer.
                    if (nAlpha == 1)
                    {
                        aAlphaMap_info2d[nIndx2d++] = (ushort)nIndx;
                    }
                    for (ushort nForeground = 0; 256 > nForeground; nForeground++)
                    {
                        if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5)))
                        {
                            nResult = 255;
                        }
                        aAlphaMap[nIndx++] = (byte)nResult;
                    }
                    //aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer;
                }
                //aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer;
            }
            cAlphaMap_info3d = cCUDA.CopyHostToDevice <int>(aAlphaMap_info3d);
            cAlphaMap = cCUDA.CopyHostToDevice <byte>(aAlphaMap);
            cAlphaMap_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap_info2d);
        }
        CUdeviceptr cAlphaMap2;
        CUdeviceptr cAlphaMap2_info2d;
        {
            // Table 2: product of two alphas, each in 1..254, as a * b / 255 + 0.5 truncated to byte.
            byte[] aAlphaMap2 = new byte[(byte.MaxValue - 1) * (byte.MaxValue - 1)];
            ushort[] aAlphaMap2_info2d = new ushort[254]; // start offset of each row
            int nIndx = 0, nIndx2d = 0;
            for (byte nFGColorAlpha = 1; 255 > nFGColorAlpha; nFGColorAlpha++) // multiplication symmetry could be exploited, but never mind that for now
            {
                aAlphaMap2_info2d[nIndx2d++] = (ushort)nIndx;
                for (byte nPixelAlpha = 1; 255 > nPixelAlpha; nPixelAlpha++)
                {
                    aAlphaMap2[nIndx++] = (byte)((float)nFGColorAlpha * nPixelAlpha / 255 + 0.5);
                }
            }
            cAlphaMap2 = cCUDA.CopyHostToDevice <byte>(aAlphaMap2);
            cAlphaMap2_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap2_info2d);
        }
        CUdeviceptr cAlphaMap3;
        CUdeviceptr cAlphaMap3_info2d;
        {
            // Table 3: value 1..255 scaled by (255 - mask) / 255 for mask 1..254, rounded via + 0.5.
            byte[] aAlphaMap3 = new byte[byte.MaxValue * (byte.MaxValue - 1)];
            ushort[] aAlphaMap3_info2d = new ushort[255]; // start offset of each row
            int nIndx = 0, nIndx2d = 0;
            for (ushort nFGColorAlpha = 1; 256 > nFGColorAlpha; nFGColorAlpha++)
            {
                aAlphaMap3_info2d[nIndx2d++] = (ushort)nIndx;
                for (byte nMask = 1; 255 > nMask; nMask++)
                {
                    aAlphaMap3[nIndx++] = (byte)(nFGColorAlpha * ((255 - nMask) / 255f) + 0.5);
                }
            }
            cAlphaMap3 = cCUDA.CopyHostToDevice <byte>(aAlphaMap3);
            cAlphaMap3_info2d = cCUDA.CopyHostToDevice <ushort>(aAlphaMap3_info2d);
        }
        #endregion CUDA Init
#if DEBUG
        // Debug bookkeeping: allocation time and area per PixelsMap ID, used to report old allocations.
        Dictionary <long, DateTime> ahDebug = new Dictionary <long, DateTime>();
        Dictionary <long, Area> ahDebugAr = new Dictionary <long, Area>();
#endif
        DateTime dtNextTime = DateTime.MinValue, dtNow;
        bool bSet;
        List <IntPtr> aDPs;
        List <PixelsMap> aPMs;
        while (true)
        {
            // When the queue is empty, log free device memory at most once every 20 minutes.
            if (1 > aqQueue.CountGet() && (dtNow = DateTime.Now) > dtNextTime)
            {
                dtNextTime = dtNow.AddMinutes(20);
#if DEBUG
                // Allocations older than two hours are listed as possibly stale.
                dtNow = dtNow.Subtract(TimeSpan.FromHours(2));
                string sMessage = "";
                foreach (long nID in ahDebug.OrderBy(o => o.Value).Select(o => o.Key))
                {
                    if (dtNow > ahDebug[nID])
                    {
                        sMessage += "<br>[" + nID + " - " + ahDebug[nID].ToString("HH:mm:ss") + "]" + ahDebugAr[nID].ToString();
                    }
                }
#endif
                (new Logger(sS)).WriteDebug("CUDA free memory:" + cCUDA.FreeMemory
#if DEBUG
                    + "; possibly timeworn allocations:" + (1 > sMessage.Length ? "no" : sMessage)
#endif
                    );
            }
            cCmd = aqQueue.Dequeue(); // per the original author's note: sleeps if there is nothing to hand out
            switch (cCmd.eID)
            {
                case Command.ID.Allocate:
                    #region
                    // Allocates device memory for a PixelsMap that has no ID yet and assigns it a new ID.
                    try
                    {
                        cCmd.cPM._cException = null;
                        if (1 > cCmd.cPM._nID)
                        {
                            if (0 < cCmd.cPM._nBytesQty)
                            {
                                nMemoryFree = cCUDA.FreeMemory;
                                // NOTE(review): nMemoryFree is uint; if _nBytesQty is also unsigned and larger than
                                // nMemoryFree this subtraction would wrap around and the check would pass — confirm.
                                if (nMemoryReservedForMerge < nMemoryFree - cCmd.cPM._nBytesQty)
                                {
                                    bMemoryStarvation = (nMemoryFree < nMemoryStarvationThreshold);
                                    (new Logger(sS)).WriteDebug3("pixelmap allocateCUDA [current_id=" + _nCurrentID + "]");
                                    cCmd.cPM._nID = System.Threading.Interlocked.Increment(ref _nCurrentID);
                                    ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.Allocate(cCmd.cPM._nBytesQty));
#if DEBUG
                                    ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
                                    ahDebugAr.Add(cCmd.cPM._nID, cCmd.cPM.stArea);
#endif
                                }
                                else
                                {
                                    bMemoryStarvation = true;
                                    throw new Exception("out of memory in CUDA device during Allocate. Only 2 MBytes reserved for the Merge");
                                }
                            }
                            else
                            {
                                throw new Exception("bytes quantity in PixelsMap have to be greater than zero for Allocate [_bDisposed = " + cCmd.cPM._bDisposed + "][_bProcessing = " + cCmd.cPM._bProcessing + "][_stPosition.X = " + cCmd.cPM._stPosition.X + "][_stPosition.Y = " + cCmd.cPM._stPosition.Y + "][_bTemp = " + cCmd.cPM._bTemp + "][_dt = " + cCmd.cPM._dtCreate + "][_nBytesQty = " + cCmd.cPM._nBytesQty + "][_nID = " + cCmd.cPM._nID + "][_nShiftTotalX = " + cCmd.cPM._nShiftTotalX + "][_stArea.nHeight = " + cCmd.cPM._stArea.nHeight + "][_stArea.nWidth = " + cCmd.cPM._stArea.nWidth + "][bKeepAlive = " + cCmd.cPM.bKeepAlive + "][eAlpha = " + cCmd.cPM.eAlpha + "][bCUDA = " + cCmd.cPM.stMergingMethod + "][nAlphaConstant = " + cCmd.cPM.nAlphaConstant + "][nID = " + cCmd.cPM.nID + "][nLength = " + cCmd.cPM.nLength + "][stArea.nHeight = " + cCmd.cPM.stArea.nHeight + "][stArea.nWidth = " + cCmd.cPM.stArea.nWidth + "]");
                            }
                        }
                        else
                        {
                            throw new Exception("PixelsMap ID have to be zero for Allocate");
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                        {
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        }
                        (new Logger(sS)).WriteError(ex);
                        (new Logger(sS)).WriteDebug("bytes qty:" + cCmd.cPM._nBytesQty);
                        cCmd.cPM._cException = ex;
                    }
                    // Signalled on success and on failure alike.
                    cCmd.cMRE.Set();
                    break;
                    #endregion
                case Command.ID.CopyIn:
                    #region
                    // Uploads host data (IntPtr or byte[] parameter). A PixelsMap without an ID gets a fresh
                    // device allocation; one with an ID is overwritten in its existing allocation.
                    try
                    {
                        cCmd.cPM._cException = null;
                        if (1 > cCmd.cPM._nID)
                        {
                            // NOTE(review): same unsigned-subtraction caveat as in Allocate — confirm operand types.
                            if (cCUDA.FreeMemory - cCmd.cPM._nBytesQty > nMemoryReservedForMerge)
                            {
                                (new Logger(sS)).WriteDebug3("pixelmap copyinCUDA not allocated [pm_id=" + _nCurrentID + "]");
                                cCmd.cPM._nID = System.Threading.Interlocked.Increment(ref _nCurrentID);
                                if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                                {
                                    ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty));
                                }
                                else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                {
                                    ahPMIDs_DevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((byte[])cCmd.ahParameters[typeof(byte[])]));
                                }
                                else
                                {
                                    throw new Exception("unknown parameter type");
                                }
#if DEBUG
                                ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
                                ahDebugAr.Add(cCmd.cPM._nID, cCmd.cPM.stArea);
#endif
                            }
                            else
                            {
                                throw new Exception("out of memory in CUDA device during CopyIn. Only 2 MBytes reserved for the Merge.");
                            }
                        }
                        else
                        {
                            (new Logger(sS)).WriteDebug4("pixelmap copyinCUDA allocated [pm_id=" + _nCurrentID + "]");
                            if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                            {
                                cCUDA.CopyHostToDevice(ahPMIDs_DevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
                            }
                            else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                            {
                                cCUDA.CopyHostToDevice(ahPMIDs_DevicePointers[cCmd.cPM._nID], (byte[])cCmd.ahParameters[typeof(byte[])]);
                            }
                            else
                            {
                                throw new Exception("unknown parameter type");
                            }
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                        {
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        }
                        (new Logger(sS)).WriteError(ex);
                        cCmd.cPM._cException = ex;
                    }
                    cCmd.cMRE.Set();
                    #endregion
                    break;
                case Command.ID.CopyOut:
                    #region
                    // Downloads the device contents of an allocated PixelsMap into the caller's byte[] or IntPtr.
                    try
                    {
                        if (0 < cCmd.cPM._nID)
                        {
                            if (!cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                            {
                                if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                {
                                    byte[] aB = (byte[])cCmd.ahParameters[typeof(byte[])];
                                    cCmd.cPM._aBytes = null;
                                    // A size mismatch is only logged; the copy is still attempted.
                                    if (cCmd.cPM._nBytesQty != aB.Length)
                                    {
                                        (new Logger(sS)).WriteWarning("wrong array size for copyout [got:" + aB.Length + "][expected:" + cCmd.cPM._nBytesQty + "]");
                                    }
                                    cCUDA.CopyDeviceToHost <byte>(ahPMIDs_DevicePointers[cCmd.cPM._nID], aB);
                                }
                                else // not used (see copyout())
                                {
                                    cCmd.cPM._aBytes = _cBinM.BytesGet((int)cCmd.cPM._nBytesQty, 3);
                                    cCUDA.CopyDeviceToHost <byte>(ahPMIDs_DevicePointers[cCmd.cPM._nID], cCmd.cPM._aBytes.aBytes);
                                }
                            }
                            else
                            {
                                cCUDA.CopyDeviceToHost(ahPMIDs_DevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
                            }
                            (new Logger(sS)).WriteDebug5("copy out [id:" + cCmd.cPM._nID + "][ptr:" + ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer + "]");
                        }
                        else
                        {
                            throw new Exception("PixelsMap have to be allocated for CopyOut");
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                        {
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        }
                        (new Logger(sS)).WriteError(ex);
                        cCmd.cPM._cException = ex;
                    }
                    cCmd.cMRE.Set();
                    #endregion
                    break;
                case Command.ID.Merge:
                    #region
                    // Launches the merge kernel with the background PixelsMap (cCmd.cPM) followed by the layers.
                    // bSet records whether the waiter was already released, so the catch does not signal twice.
                    bSet = false;
                    try
                    {
                        aPMs = (List <PixelsMap>)cCmd.ahParameters[typeof(List <PixelsMap>)];
                        DisCom.MergeInfo cMergeInfo = (DisCom.MergeInfo)cCmd.ahParameters[typeof(DisCom.MergeInfo)];
                        aDPs = new List <IntPtr>();
                        if (1 > cCmd.cPM._nID)
                        {
                            throw new Exception("background PixelsMap have to be allocated for Merge");
                        }
                        // Element 0 is the background; the layers follow in list order.
                        aDPs.Add((IntPtr)ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer);
                        for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
                        {
                            if (!ahPMIDs_DevicePointers.ContainsKey(aPMs[nIndx]._nID))
                            {
                                throw new Exception("there is a corrupted ID in layers for merge [id:" + aPMs[nIndx]._nID + "]");
                            }
                            if (1 > ahPMIDs_DevicePointers[aPMs[nIndx]._nID].Pointer)
                            {
                                throw new Exception("there is an empty pointer in layers for merge [id:" + aPMs[nIndx]._nID + "]");
                            }
                            aDPs.Add((IntPtr)ahPMIDs_DevicePointers[aPMs[nIndx]._nID].Pointer);
                        }

                        cPMs = cCUDA.CopyHostToDevice <IntPtr>(aDPs.ToArray());
                        cInfos = cCUDA.CopyHostToDevice(cMergeInfo, cMergeInfo.SizeGet()); // operator intptr in DisCom.MergeInfo

                        // Nine pointer-sized kernel arguments, laid out back to back at multiples of IntPtr.Size.
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, 0, (IntPtr)cPMs.Pointer);
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size, (IntPtr)cInfos.Pointer);
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 2, (IntPtr)cAlphaMap.Pointer); // table 1 data
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 3, (IntPtr)cAlphaMap_info3d.Pointer); // table 1 layer offsets
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 4, (IntPtr)cAlphaMap_info2d.Pointer); // table 1 row offsets
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 5, (IntPtr)cAlphaMap2.Pointer);
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 6, (IntPtr)cAlphaMap2_info2d.Pointer); // table 2 row offsets
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 7, (IntPtr)cAlphaMap3.Pointer);
                        cCUDA.SetParameter <IntPtr>(cCUDAFuncMerge, IntPtr.Size * 8, (IntPtr)cAlphaMap3_info2d.Pointer); // table 3 row offsets
                        cCUDA.SetParameterSize(cCUDAFuncMerge, (uint)(IntPtr.Size * 9));
                        // Grid dimensions: ceil(width / block) by ceil(height / block).
                        int nIterationsX = (0 == cMergeInfo.nBackgroundWidth % nThreadsPerBlock ? cMergeInfo.nBackgroundWidth / nThreadsPerBlock : cMergeInfo.nBackgroundWidth / nThreadsPerBlock + 1);
                        int nIterationsY = (0 == cMergeInfo.nBackgroundHight % nThreadsPerBlock ? cMergeInfo.nBackgroundHight / nThreadsPerBlock : cMergeInfo.nBackgroundHight / nThreadsPerBlock + 1);
                        //int nIterationsX = (0 == cMergeInfo.nBackgroundHight % nThreadsPerBlock ? cMergeInfo.nBackgroundHight / nThreadsPerBlock : cMergeInfo.nBackgroundHight / nThreadsPerBlock + 1);
                        cCUDA.Launch(cCUDAFuncMerge, nIterationsX, nIterationsY);
                        cCUDA.Free(cPMs);
                        cCUDA.Free(cInfos);
                        // Release the waiter before the layer clean-up below.
                        cCmd.cMRE.Set();
                        bSet = true;
                        cMergeInfo.Dispose();
                        for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
                        {
                            // Only the flag update is inside the lock; Dispose() runs after it is released.
                            lock (aPMs[nIndx]._cSyncRoot)
                                aPMs[nIndx]._bProcessing = false;
                            aPMs[nIndx].Dispose();
                        }
                    }
                    catch (Exception ex)
                    {
                        // The raw exception is stored on the PixelsMap; the CUDA-wrapped one is only logged.
                        cCmd.cPM._cException = ex;
                        if (!bSet)
                        {
                            cCmd.cMRE.Set();
                        }
                        if (ex is CUDAException)
                        {
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        }
                        (new Logger(sS)).WriteError(ex);
                    }
                    #endregion
                    break;
                case Command.ID.Dispose:
                    #region
                    // Frees the device allocation of a PixelsMap and forgets its ID.
                    // Note: cCmd.cMRE is not signalled in this case.
                    (new Logger(sS)).Write(Logger.Level.debug4, "dispose: in");
                    try
                    {
                        if (ahPMIDs_DevicePointers.ContainsKey(cCmd.cPM._nID))
                        {
                            if (0 < cCmd.cPM._nID && 0 < ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer)
                            {
                                cCUDA.Free(ahPMIDs_DevicePointers[cCmd.cPM._nID]);
                                //cCUDA.SynchronizeContext();
                                bMemoryStarvation = (cCUDA.FreeMemory < nMemoryStarvationThreshold);
                                (new Logger(sS)).WriteDebug3("dispose [id:" + cCmd.cPM._nID + "][ptr:" + ahPMIDs_DevicePointers[cCmd.cPM._nID].Pointer + "]");
                            }
                            ahPMIDs_DevicePointers.Remove(cCmd.cPM._nID);
#if DEBUG
                            ahDebug.Remove(cCmd.cPM._nID);
                            ahDebugAr.Remove(cCmd.cPM._nID);
#endif
                            cCmd.cPM._nID = 0;
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                        {
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        }
                        (new Logger(sS)).WriteError(ex);
                        cCmd.cPM._cException = ex;
                    }
                    (new Logger(sS)).Write(Logger.Level.debug4, "dispose: out");
                    #endregion
                    break;
            }
        }
    }
    catch (Exception ex)
    {
        // Anything escaping the loop (including initialisation failures) stops this worker.
        if (ex is CUDAException)
        {
            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
        }
        (new Logger(sS)).WriteError("CUDA STOPPED!!!! [id = " + _nIndex + "]", ex);
    }
#endif
}
/// <summary>
/// Command loop of the static CUDA worker. Tries device ordinals 0..9 until a
/// context can be created, loads the merge kernel module from the embedded
/// resources, uploads one alpha lookup table, and then processes commands
/// taken from _aqCommands forever: Allocate, CopyIn, CopyOut, Merge and
/// Dispose. Per-command failures are logged and stored in
/// cCmd.cPM._cException; an exception escaping the loop is logged and ends
/// the method.
/// </summary>
/// <param name="cState">Not referenced anywhere in the body.</param>
static private void Worker(object cState)
{
    try
    {
        Command cCmd;
        CUDA cCUDA = new CUDA(true);
        // Use the first device ordinal (0..9) on which a context can be created.
        // NOTE(review): if all ten attempts fail, execution continues without a context — confirm intended.
        for (int i = 0; i < 10; i++)
        {
            try
            {
                cCUDA.CreateContext(i);
                (new Logger()).WriteDebug2(i + ": success");
                break;
            }
            catch (Exception ex)
            {
                (new Logger()).WriteDebug2(i + ": failed");
                if (Logger.bDebug && Logger.Level.debug3 > Logger.eLevelMinimum)
                    (new Logger()).WriteError(ex);
            }
        }
        uint nMemoryReservedForMerge = 2 * 1024 * 1024; // PREFERENCES: something like <memory reserved="2097152" />
        uint nMemoryStarvationThreshold = cCUDA.TotalMemory / 2; // PREFERENCES: as a percentage... something like <memory starvation="50%" />
        uint nMemoryFree;
        // Resource name encodes the configured CUDA version and the process bitness (32/64).
        string sModule = "CUDAFunctions_" + Preferences.nCUDAVersion + "_x" + (IntPtr.Size * 8);
        if (Logger.bDebug)
            (new Logger()).WriteDebug3(sModule);
        cCUDA.LoadModule((byte[])Properties.Resource.ResourceManager.GetObject(sModule)); // $(ProjectDir)Resources\CUDAFunctions.cubin
        //cCUDA.LoadModule(@"c:\projects\!helpers\video\PixelsMap\Resources\CUDAFunctions.cubin");
        CUfunction cCUDAFuncMerge = cCUDA.GetModuleFunction("CUDAFrameMerge");
        // The original author had to reduce from 512 to 256 threads per block because, after adding
        // "motion" and float operations, the launch failed with: Too Many Resources Requested for Launch
        // (This error means that the number of registers available on the multiprocessor is being
        // exceeded. Reduce the number of threads per block to solve the problem)
        int nThreadsPerBlock = 256;
        // One-dimensional block: nThreadsPerBlock x 1 x 1.
        cCUDA.SetFunctionBlockShape(cCUDAFuncMerge, nThreadsPerBlock, 1, 1);
        CUDADriver.cuParamSetSize(cCUDAFuncMerge, 8);

        // Maps PixelsMap IDs to the device allocation that backs them.
        Dictionary<ulong, CUdeviceptr> ahDevicePointers = new Dictionary<ulong, CUdeviceptr>();
        CUdeviceptr cPMs;
        CUdeviceptr cInfos;
        CUdeviceptr cAlphaMap;
        {
            //IntPtr[] aPointersByAlpha = new IntPtr[254]; // those very alpha pointers. They refer to arrays of B pointers, i.e. BackGrounds
            //IntPtr[] aPointersByBackground = new IntPtr[256]; // those very arrays of B pointers, i.e. BackGrounds
            // 16646144 = 254 * 256 * 256: one byte per (alpha 1..254, background 0..255, foreground 0..255),
            // holding alpha * (fg - bg) / 255 + bg + 0.5 truncated to int and capped at 255.
            byte[] aAlphaMap = new byte[16646144];
            int nResult, nIndx = 0;
            for (byte nAlpha = 1; 255 > nAlpha; nAlpha++)
            {
                for (ushort nBackground = 0; 256 > nBackground; nBackground++)
                {
                    for (ushort nForeground = 0; 256 > nForeground; nForeground++)
                    {
                        if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5)))
                            nResult = 255;
                        aAlphaMap[nIndx++] = (byte)nResult;
                    }
                    //aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer;
                }
                //aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer;
            }
            cAlphaMap = cCUDA.CopyHostToDevice<byte>(aAlphaMap);
        }
        // Disabled earlier variant kept from the original: one 256-byte table per (alpha, background)
        // pair, chained through device arrays of pointers.
        //{
        //    IntPtr[] aPointersByAlpha = new IntPtr[254]; // those very alpha pointers. They refer to arrays of B pointers, i.e. BackGrounds
        //    IntPtr[] aPointersByBackground = new IntPtr[256]; // those very arrays of B pointers, i.e. BackGrounds
        //    byte[] aResults = new byte[256];
        //    int nResult;
        //    for (byte nAlpha = 1; 255 > nAlpha; nAlpha++)
        //    {
        //        for (ushort nBackground = 0; 256 > nBackground; nBackground++)
        //        {
        //            for (ushort nForeground = 0; 256 > nForeground; nForeground++)
        //            {
        //                if (255 < (nResult = (int)((float)(nAlpha * (nForeground - nBackground)) / 255 + nBackground + 0.5)))
        //                    nResult = 255;
        //                aResults[nForeground] = (byte)nResult;
        //            }
        //            aPointersByBackground[nBackground] = (IntPtr)cCUDA.CopyHostToDevice<byte>(aResults).Pointer;
        //        }
        //        aPointersByAlpha[nAlpha - 1] = (IntPtr)cCUDA.CopyHostToDevice<IntPtr>(aPointersByBackground).Pointer;
        //    }
        //    cAlphaMap = cCUDA.CopyHostToDevice<IntPtr>(aPointersByAlpha);
        //}
#if DEBUG
        // Debug bookkeeping: allocation time per PixelsMap ID, used to report old allocations.
        Dictionary<ulong, DateTime> ahDebug = new Dictionary<ulong,DateTime>();
#endif
        DateTime dtNextTime = DateTime.MinValue, dtNow;
        long nStartTick; // logging
        while (true)
        {
            // When the queue is empty, log free device memory at most once every 60 seconds.
            if (1 > _aqCommands.CountGet() && (dtNow = DateTime.Now) > dtNextTime)
            {
                dtNextTime = dtNow.AddSeconds(60);
#if DEBUG
                // Allocations older than two hours are listed as possibly stale.
                dtNow = dtNow.Subtract(TimeSpan.FromHours(2));
                string sMessage = "";
                foreach (ulong nID in ahDebug.Keys)
                    if (dtNow > ahDebug[nID])
                        sMessage += "<br>[" + nID + ":" + ahDebug[nID].ToString("HH:mm:ss") + "]";
#endif
                (new Logger()).WriteDebug("CUDA free memory:" + cCUDA.FreeMemory
#if DEBUG
                    + "; possibly timeworn allocations:" + (1 > sMessage.Length ? "no" : sMessage)
#endif
                    );
            }
            // Retry Dequeue until it returns without throwing; failures are logged.
            while (true)
            {
                try
                {
                    cCmd = _aqCommands.Dequeue(); // per the original author's note: sleeps if there is nothing to hand out
                    break;
                }
                catch (Exception ex)
                {
                    (new Logger()).WriteError(ex);
                }
            }
            _CommandsCount = _aqCommands.nCount;
            switch (cCmd.eID)
            {
                case Command.ID.Allocate:
                    #region
                    // Allocates device memory for a PixelsMap that has no ID yet and assigns it a new ID.
                    try
                    {
                        cCmd.cPM._cException = null;
                        if (1 > cCmd.cPM._nID)
                        {
                            if (0 < cCmd.cPM._nBytesQty)
                            {
                                nMemoryFree = cCUDA.FreeMemory;
                                // NOTE(review): nMemoryFree is uint; if _nBytesQty is also unsigned and larger than
                                // nMemoryFree this subtraction would wrap around and the check would pass — confirm.
                                if (nMemoryReservedForMerge < nMemoryFree - cCmd.cPM._nBytesQty)
                                {
                                    bMemoryStarvation = (nMemoryFree < nMemoryStarvationThreshold);
                                    cCmd.cPM._nID = _nCurrentID++;
                                    ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.Allocate(cCmd.cPM._nBytesQty));
#if DEBUG
                                    ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
#endif
                                }
                                else
                                {
                                    bMemoryStarvation = true;
                                    throw new Exception("out of memory in CUDA device during Allocate. Only 2 MBytes reserved for the Merge");
                                }
                            }
                            else
                                throw new Exception("bytes quantity in PixelsMap have to be greater than zero for Allocate [_bDisposed = " + cCmd.cPM._bDisposed + "][_bProcessing = " + cCmd.cPM._bProcessing + "][_bShiftVertical = " + cCmd.cPM._bShiftVertical + "][_bTemp = " + cCmd.cPM._bTemp + "][_dt = " + cCmd.cPM._dt + "][_nBytesQty = " + cCmd.cPM._nBytesQty + "][_nID = " + cCmd.cPM._nID + "][_nShiftPosition = " + cCmd.cPM._nShiftPosition + "][_stArea.nHeight = " + cCmd.cPM._stArea.nHeight + "][_stArea.nWidth = " + cCmd.cPM._stArea.nWidth + "][bKeepAlive = " + cCmd.cPM.bKeepAlive + "][bBackgroundClear = " + cCmd.cPM.bBackgroundClear + "][eAlpha = " + cCmd.cPM.eAlpha + "][bCUDA = " + cCmd.cPM.bCUDA + "][nAlphaConstant = " + cCmd.cPM.nAlphaConstant + "][nID = " + cCmd.cPM.nID + "][nLength = " + cCmd.cPM.nLength + "][stArea.nHeight = " + cCmd.cPM.stArea.nHeight + "][stArea.nWidth = " + cCmd.cPM.stArea.nWidth + "]");
                        }
                        else
                            throw new Exception("PixelsMap ID have to be zero for Allocate");
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        (new Logger()).WriteError(ex);
                        (new Logger()).WriteDebug("bytes qty:" + cCmd.cPM._nBytesQty);
                        cCmd.cPM._cException = ex;
                    }
                    // Signalled on success and on failure alike.
                    cCmd.cMRE.Set();
                    break;
                    #endregion
                case Command.ID.CopyIn:
                    #region
                    // Uploads host data (IntPtr or byte[] parameter). A PixelsMap without an ID gets a fresh
                    // device allocation; one with an ID is overwritten in its existing allocation.
                    nStartTick = DateTime.Now.Ticks; // logging
                    try
                    {
                        cCmd.cPM._cException = null;
                        if (1 > cCmd.cPM._nID)
                        {
                            // NOTE(review): same unsigned-subtraction caveat as in Allocate — confirm operand types.
                            if (cCUDA.FreeMemory - cCmd.cPM._nBytesQty > nMemoryReservedForMerge)
                            {
                                cCmd.cPM._nID = _nCurrentID++;
                                if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                                    ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty));
                                else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                    ahDevicePointers.Add(cCmd.cPM._nID, cCUDA.CopyHostToDevice((byte[])cCmd.ahParameters[typeof(byte[])]));
                                else
                                    throw new Exception("unknown parameter type");
#if DEBUG
                                ahDebug.Add(cCmd.cPM._nID, DateTime.Now);
#endif
                            }
                            else
                                throw new Exception("out of memory in CUDA device during CopyIn. Only 2 MBytes reserved for the Merge.");
                        }
                        else
                        {
                            if (cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                                cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
                            else if (cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                cCUDA.CopyHostToDevice(ahDevicePointers[cCmd.cPM._nID], (byte[])cCmd.ahParameters[typeof(byte[])]);
                            else
                                throw new Exception("unknown parameter type");
                        }
                        if (ahDevicePointers.ContainsKey(cCmd.cPM._nID))
                            (new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
                        else
                            (new Logger()).WriteDebug5("copy in [id:" + cCmd.cPM._nID + "][ptr: not in dictionary]");
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        (new Logger()).WriteError(ex);
                        cCmd.cPM._cException = ex;
                    }
                    // Report commands that took 20 ms or longer.
                    if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20) // logging
                        (new Logger()).WriteNotice("PixelMap: Command.ID.CopyIn: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms"); // logging
                    cCmd.cMRE.Set();
                    break;
                    #endregion
                case Command.ID.CopyOut:
                    #region
                    // Downloads the device contents of an allocated PixelsMap into a byte[] or the caller's IntPtr.
                    nStartTick = DateTime.Now.Ticks; // logging
                    try
                    {
                        if (0 < cCmd.cPM._nID)
                        {
                            if(!cCmd.ahParameters.ContainsKey(typeof(IntPtr)))
                            {
                                if(cCmd.ahParameters.ContainsKey(typeof(byte[])))
                                {
                                    cCmd.cPM._aBytes = (byte[])cCmd.ahParameters[typeof(byte[])];
                                    // A size mismatch is only logged; the copy is still attempted.
                                    if(cCmd.cPM._nBytesQty != cCmd.cPM._aBytes.Length)
                                        (new Logger()).WriteWarning("wrong array size for copyout [got:" + cCmd.cPM._aBytes.Length + "][expected:" + cCmd.cPM._nBytesQty + "]");
                                }
                                else
                                    cCmd.cPM._aBytes = new byte[cCmd.cPM._nBytesQty];
                                cCUDA.CopyDeviceToHost<byte>(ahDevicePointers[cCmd.cPM._nID], cCmd.cPM._aBytes);
                            }
                            else
                                cCUDA.CopyDeviceToHost(ahDevicePointers[cCmd.cPM._nID], (IntPtr)cCmd.ahParameters[typeof(IntPtr)], cCmd.cPM._nBytesQty);
                            (new Logger()).WriteDebug5("copy out [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
                        }
                        else
                            throw new Exception("PixelsMap have to be allocated for CopyOut");
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        (new Logger()).WriteError(ex);
                        cCmd.cPM._cException = ex;
                    }
                    // Report commands that took 20 ms or longer.
                    if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20) // logging
                        (new Logger()).WriteNotice("PixelMap: Command.ID.CopyOut: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms"); // logging
                    cCmd.cMRE.Set();
                    break;
                    #endregion
                case Command.ID.Merge:
                    #region
                    // Launches the merge kernel with the background PixelsMap (cCmd.cPM) followed by the layers.
                    try
                    {
                        List<PixelsMap> aPMs = (List<PixelsMap>)cCmd.ahParameters[typeof(List<PixelsMap>)];
                        DisCom.MergeInfo cMergeInfo = (DisCom.MergeInfo)cCmd.ahParameters[typeof(DisCom.MergeInfo)];
                        List<IntPtr> aDPs = new List<IntPtr>();
                        if (1 > cCmd.cPM._nID)
                            throw new Exception("background PixelsMap have to be allocated for Merge");
                        // Element 0 is the background; the layers follow in list order.
                        aDPs.Add((IntPtr)ahDevicePointers[cCmd.cPM._nID].Pointer);
                        for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
                        {
                            if (!ahDevicePointers.ContainsKey(aPMs[nIndx]._nID))
                                throw new Exception("there is a corrupted ID in layers for merge [id:" + aPMs[nIndx]._nID + "]");
                            if (1 > ahDevicePointers[aPMs[nIndx]._nID].Pointer)
                                throw new Exception("there is an empty pointer in layers for merge [id:" + aPMs[nIndx]._nID + "]");
                            aDPs.Add((IntPtr)ahDevicePointers[aPMs[nIndx]._nID].Pointer);
                        }

                        cPMs = cCUDA.CopyHostToDevice<IntPtr>(aDPs.ToArray());
                        cInfos = cCUDA.CopyHostToDevice(cMergeInfo, cMergeInfo.SizeGet());

                        // Three pointer-sized kernel arguments, laid out back to back at multiples of IntPtr.Size.
                        cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, 0, (IntPtr)cPMs.Pointer);
                        cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size, (IntPtr)cInfos.Pointer);
                        cCUDA.SetParameter<IntPtr>(cCUDAFuncMerge, IntPtr.Size * 2, (IntPtr)cAlphaMap.Pointer);
                        cCUDA.SetParameterSize(cCUDAFuncMerge, (uint)(IntPtr.Size * 3));
                        // Grid width: ceil(nBackgroundSize / nThreadsPerBlock).
                        int nIterations = (0 == cMergeInfo.nBackgroundSize % nThreadsPerBlock ? cMergeInfo.nBackgroundSize / nThreadsPerBlock : cMergeInfo.nBackgroundSize / nThreadsPerBlock + 1);
                        cCUDA.Launch(cCUDAFuncMerge, nIterations, 1);
                        // Release the waiter before the clean-up below.
                        cCmd.cMRE.Set();

                        cMergeInfo.Dispose();
                        // NOTE(review): cPMs and cInfos are freed only on this success path; an exception
                        // thrown after they are uploaded skips these Free calls — confirm whether that leaks.
                        cCUDA.Free(cPMs);
                        cCUDA.Free(cInfos);
                        for (int nIndx = 0; nIndx < aPMs.Count; nIndx++)
                        {
                            // Only the flag update is inside the lock; Dispose() runs after it is released.
                            lock (aPMs[nIndx]._cSyncRoot)
                                aPMs[nIndx]._bProcessing = false;
                            aPMs[nIndx].Dispose();
                        }
                    }
                    catch (Exception ex)
                    {
                        // May signal a second time if the failure happened after the Set() in the try block.
                        cCmd.cMRE.Set();
                        if (ex is CUDAException)
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        (new Logger()).WriteError(ex);
                        cCmd.cPM._cException = ex;
                    }
                    break;
                    #endregion
                case Command.ID.Dispose:
                    #region
                    // Frees the device allocation of a PixelsMap and forgets its ID.
                    // Note: cCmd.cMRE is not signalled in this case.
                    nStartTick = DateTime.Now.Ticks; // logging
                    (new Logger()).Write(Logger.Level.debug2, "dispose: in");
                    try
                    {
                        if (ahDevicePointers.ContainsKey(cCmd.cPM._nID))
                        {
                            if (0 < cCmd.cPM._nID && 0 < ahDevicePointers[cCmd.cPM._nID].Pointer)
                            {
                                cCUDA.Free(ahDevicePointers[cCmd.cPM._nID]);
                                //cCUDA.SynchronizeContext();
                                bMemoryStarvation = (cCUDA.FreeMemory < nMemoryStarvationThreshold);
                                (new Logger()).WriteDebug3("dispose [id:" + cCmd.cPM._nID + "][ptr:" + ahDevicePointers[cCmd.cPM._nID].Pointer + "]");
                            }
                            ahDevicePointers.Remove(cCmd.cPM._nID);
#if DEBUG
                            ahDebug.Remove(cCmd.cPM._nID);
#endif
                            cCmd.cPM._nID = 0;
                        }
                    }
                    catch (Exception ex)
                    {
                        if (ex is CUDAException)
                            ex = new Exception("CUDA Error:" + ((CUDAException)ex).CUDAError.ToString(), ex);
                        (new Logger()).WriteError(ex);
                        cCmd.cPM._cException = ex;
                    }
                    (new Logger()).Write(Logger.Level.debug2, "dispose: out");
                    // Report commands that took 20 ms or longer.
                    if (new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds >= 20) // logging
                        (new Logger()).WriteNotice("PixelMap: Command.ID.Dispose: execution time > 20ms: " + new TimeSpan(DateTime.Now.Ticks - nStartTick).TotalMilliseconds +"ms"); // logging
                    break;
                    #endregion
            }
        }
    }
    catch (Exception ex)
    {
        // Anything escaping the loop (including initialisation failures) stops this worker.
        (new Logger()).WriteError(ex);
    }
}
/// <summary>
/// One-time encoder initialisation; does nothing once <c>inited</c> is set.
/// Creates the CUDA wrapper and a context on device 0, loads the embedded
/// "flacuda.cubin" module, opens the output stream if none was supplied,
/// writes the stream header, creates the two GPU tasks plus one task per
/// configured CPU thread, and allocates the device window buffer.
/// </summary>
/// <exception cref="ArgumentNullException">
/// The embedded resource "flacuda.cubin" could not be found.
/// </exception>
public unsafe void InitTasks()
{
    bool doMidside = channels == 2 && eparams.do_midside;
    // Mid/side encoding doubles the number of channel variants per task.
    int channelCount = doMidside ? 2 * channels : channels;
    if (inited)
        return;

    cuda = new CUDA(true, InitializationFlags.None);
    cuda.CreateContext(0, CUCtxFlags.SchedAuto);

    // Read the module image as raw bytes. Decoding it as text and re-encoding it as ASCII
    // replaces every byte >= 0x80 with '?', which corrupts any non-text module image;
    // for a pure-ASCII image the bytes produced here are the same as before.
    byte[] moduleImage;
    using (Stream cubin = GetType().Assembly.GetManifestResourceStream(GetType(), "flacuda.cubin"))
    {
        if (cubin == null)
            throw new ArgumentNullException("cubin", "Embedded resource flacuda.cubin was not found.");
        using (MemoryStream image = new MemoryStream())
        {
            byte[] chunk = new byte[4096];
            int bytesRead;
            while ((bytesRead = cubin.Read(chunk, 0, chunk.Length)) > 0)
                image.Write(chunk, 0, bytesRead);
            moduleImage = image.ToArray();
        }
    }
    cuda.LoadModule(moduleImage);
    //cuda.LoadModule(System.IO.Path.Combine(Environment.CurrentDirectory, "flacuda.cubin"));

    if (_IO == null)
        _IO = new FileStream(_path, FileMode.Create, FileAccess.Write, FileShare.Read);

    int header_size = flake_encode_init();
    _IO.Write(header, 0, header_size);
    // Remember where the first frame starts when the output stream is seekable.
    if (_IO.CanSeek)
        first_frame_offset = _IO.Position;

    task1 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
    task2 = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
    if (_settings.CPUThreads > 0)
    {
        cpu_tasks = new FlaCudaTask[_settings.CPUThreads];
        for (int i = 0; i < cpu_tasks.Length; i++)
            cpu_tasks[i] = new FlaCudaTask(cuda, channelCount, channels, bits_per_sample, max_frame_size, _settings.DoVerify);
    }

    cudaWindow = cuda.Allocate((uint)sizeof(float) * FlaCudaWriter.MAX_BLOCKSIZE * 2 * lpc.MAX_LPC_WINDOWS);

    inited = true;
}