private void InitCudaModule() { cuda = gpuKernel.cuda; //cuda = new CUDA(0, true); //cuCtx = cuda.CreateContext(0, CUCtxFlags.MapHost); //cuda.SetCurrentContext(cuCtx); string modluePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName); if (!File.Exists(modluePath)) { throw new ArgumentException("Failed access to cuda module" + modluePath); } cuModule = cuda.LoadModule(modluePath); cuFuncFindMaxIMinJ = cuda.GetModuleFunction(funcFindMaxIMinJ); cuFuncUpdateG = cuda.GetModuleFunction(funcUpdateGFunc); }
public static extern CUResult cuParamSetv(CUfunction hfunc, int offset, ref double ptr, uint numbytes);
public static extern CUResult cuParamSetv(CUfunction hfunc, int offset, ref long value, uint numbytes);
public static extern CUResult cuParamSetTexRef(CUfunction hfunc, int texunit, CUtexref hTexRef);
public static extern CUResult cuFuncGetAttribute(ref int pi, CUFunctionAttribute attrib, CUfunction hfunc);
static void Main(string[] args) { // Init and select 1st device. CUDA cuda = new CUDA(0, true); // load module //cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, "simpleCUFFT.ptx")); CUfunction func = new CUfunction();// cuda.GetModuleFunction("ComplexPointwiseMulAndScale"); // The filter size is assumed to be a number smaller than the signal size const int SIGNAL_SIZE = 50; const int FILTER_KERNEL_SIZE = 11; // Allocate host memory for the signal Float2[] h_signal = new Float2[SIGNAL_SIZE]; // Initalize the memory for the signal Random r = new Random(); for (int i = 0; i < SIGNAL_SIZE; ++i) { h_signal[i].x = r.Next() / (float)int.MaxValue; h_signal[i].y = 0; } // Allocate host memory for the filter Float2[] h_filter_kernel = new Float2[FILTER_KERNEL_SIZE]; // Initalize the memory for the filter for (int i = 0; i < FILTER_KERNEL_SIZE; ++i) { h_filter_kernel[i].x = r.Next() / (float)int.MaxValue; h_filter_kernel[i].y = 0; } // Pad signal and filter kernel Float2[] h_padded_signal; Float2[] h_padded_filter_kernel; int new_size = PadData(h_signal, out h_padded_signal, SIGNAL_SIZE, h_filter_kernel, out h_padded_filter_kernel, FILTER_KERNEL_SIZE); // Allocate device memory for signal // Copy host memory to device CUdeviceptr d_signal = cuda.CopyHostToDevice<Float2>(h_padded_signal); // Allocate device memory for filter kernel // Copy host memory to device CUdeviceptr d_filter_kernel = cuda.CopyHostToDevice<Float2>(h_padded_filter_kernel); // CUFFT plan CUFFT fft = new CUFFT(cuda); cufftHandle handle = new cufftHandle(); CUFFTResult fftres = CUFFTDriver.cufftPlan1d(ref handle, new_size, CUFFTType.C2C, 1); //fft.Plan1D(new_size, CUFFTType.C2C, 1); return; // Transform signal and kernel fft.ExecuteComplexToComplex(d_signal, d_signal, CUFFTDirection.Forward); fft.ExecuteComplexToComplex(d_filter_kernel, d_filter_kernel, CUFFTDirection.Forward); // Multiply the coefficients together and normalize the result // ComplexPointwiseMulAndScale<<<32, 256>>>(d_signal, d_filter_kernel, new_size, 1.0f / new_size); cuda.SetFunctionBlockShape(func, 256, 1, 1); cuda.SetParameter(func, 0, (uint)d_signal.Pointer); cuda.SetParameter(func, IntPtr.Size, (uint)d_filter_kernel.Pointer); cuda.SetParameter(func, IntPtr.Size * 2, (uint)new_size); cuda.SetParameter(func, IntPtr.Size * 2 + 4, 1.0f / new_size); cuda.SetParameterSize(func, (uint)(IntPtr.Size * 2 + 8)); cuda.Launch(func, 32, 1); // Transform signal back fft.ExecuteComplexToComplex(d_signal, d_signal, CUFFTDirection.Inverse); // Copy device memory to host Float2[] h_convolved_signal = h_padded_signal; cuda.CopyDeviceToHost<Float2>(d_signal, h_convolved_signal); // Allocate host memory for the convolution result Float2[] h_convolved_signal_ref = new Float2[SIGNAL_SIZE]; // Convolve on the host Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref); // check result bool res = cutCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 2 * SIGNAL_SIZE, 1e-5f); Console.WriteLine("Test {0}", (true == res) ? "PASSED" : "FAILED"); //Destroy CUFFT context fft.Destroy(); // cleanup memory cuda.Free(d_signal); cuda.Free(d_filter_kernel); }
public override void DoLayout() { CUdeviceptr p1 = new CUdeviceptr(); CUDADriver.cuMemAlloc(ref p1, 1 <<10); byte[] b = new byte[1<<10]; CUDADriver.cuMemcpyHtoD(p1, b, (uint) b.Length); CUfunction func = new CUfunction(); CUResult res; int nnodes = (int) Network.VertexCount*2; int blocks = 32; if (nnodes < 1024*blocks) nnodes = 1024*blocks; while ((nnodes & (prop.SIMDWidth-1)) != 0) nnodes++; nnodes--; //float dtime = 0.025f; float dthf = dtime * 0.5f; //float epssq = 0.05f * 0.05f; //float itolsq = 1.0f / (0.5f * 0.5f); CUDADriver.cuModuleGetFunction(ref func, mod, "dummy"); // Float4[] data = new Float4[100]; CUdeviceptr ptr = new CUdeviceptr(); //CUDADriver.cuMemAlloc(ref ptr, (uint) 100 * System.Runtime.InteropServices.Marshal.SizeOf(Float4)); CUDADriver.cuParamSeti(func, 0, (uint) ptr.Pointer); CUDADriver.cuParamSetSize(func, 4); res = CUDADriver.cuLaunch(func); if(res != CUResult.Success) Logger.AddMessage(LogEntryType.Warning, "CUDA Error in dummy function: " +res.ToString()); // InitializationKernel<<<1, 1>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "InitializationKernel"); res = CUDADriver.cuLaunch(func); if(res != CUResult.Success) Logger.AddMessage(LogEntryType.Warning, "CUDA Error in InitializationKernel: " +res.ToString()); // BoundingBoxKernel<<<blocks * FACTOR1, THREADS1>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "BoundingBoxKernel: "+res.ToString()); CUDADriver.cuLaunch(func); if(res != CUResult.Success) Logger.AddMessage(LogEntryType.Warning, "CUDA Error in BoundingBoxKernel: "+res.ToString()); // TreeBuildingKernel<<<blocks * FACTOR2, THREADS2>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "TreeBuildingKernel: "+res.ToString()); CUDADriver.cuLaunch(func); if(res != CUResult.Success) Logger.AddMessage(LogEntryType.Warning, "CUDA Error in TreeBuildingKernel: "+res.ToString()); // SummarizationKernel<<<blocks * FACTOR3, THREADS3>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "SummarizationKernel: "+res.ToString()); CUDADriver.cuLaunch(func); if(res != CUResult.Success) Logger.AddMessage(LogEntryType.Warning, "CUDA Error in SummarizationKernel: "+res.ToString()); // ForceCalculationKernel<<<blocks * FACTOR5, THREADS5>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "ForceCalculationKernel: "+res.ToString()); CUDADriver.cuLaunch(func); if(res != CUResult.Success) Logger.AddMessage(LogEntryType.Warning, "CUDA Error in ForceCalculationKernel: "+res.ToString()); // IntegrationKernel<<<blocks * FACTOR6, THREADS6>>>(); CUDADriver.cuModuleGetFunction(ref func, mod, "IntegrationKernel"); CUDADriver.cuLaunch(func); if(res != CUResult.Success) Logger.AddMessage(LogEntryType.Warning, "CUDA Error in IntegrationKernel: "+res.ToString()); }
private void InitCudaModule() { string modluePath = Path.Combine(Environment.CurrentDirectory, cudaModuleName); if (!File.Exists(modluePath)) throw new ArgumentException("Failed access to cuda module" + modluePath); cuModule = cuda.LoadModule(modluePath); cuFuncDense = cuda.GetModuleFunction(funcName); }
public static extern CUResult cuLaunchGridAsync(CUfunction f, int grid_width, int grid_height, CUstream hStream);
public static extern CUResult cuLaunchGrid(CUfunction f, int grid_width, int grid_height);
public static extern CUResult cuLaunch(CUfunction f);
public static extern CUResult cuFuncSetSharedSize(CUfunction hfunc, uint bytes);
public static extern CUResult cuFuncSetCacheConfig(CUfunction hfunc, CUFunctionCache config);
public static extern CUResult cuFuncSetBlockShape(CUfunction hfunc, int x, int y, int z);
public static extern CUResult cuParamSetv(CUfunction hfunc, int offset, [In] Short1[] ptr, uint numbytes);
//private double ComputeObj(float[] w, float[] alpha, Problem<SparseVec> sub_prob, float[] diag) //{ // double v = 0, v1=0; // int nSV = 0; // for (int i = 0; i < w.Length; i++) // { // v += w[i] * w[i]; // v1 += 0.5*w[i] * w[i]; // } // for (int i = 0; i < alpha.Length; i++) // { // sbyte y_i = (sbyte)sub_prob.Y[i]; // //original line // //v += alpha[i] * (alpha[i] * diag[GETI(y_i, i)] - 2); // v += alpha[i] * (alpha[i] * diag[y_i + 1] - 2); // v1 += 0.5* alpha[i] * (alpha[i] * diag[y_i + 1] - 2); // if (alpha[i] > 0) ++nSV; // } // v = v / 2; // // Debug.WriteLine("Objective value = {0}", v); // // Debug.WriteLine("nSV = {0}", nSV); // return v; //} protected void InitCudaModule() { cuda = new CUDA(0, true); cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName)); cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName); cuFuncSolver = cuda.GetModuleFunction(cudaSolveL2SVM); cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW); }
public static extern CUResult cuModuleGetFunction(ref CUfunction hfunc, CUmodule hmod, string name);
public void Initialize() { float softeningSquared = 0.00125f; Random random = new Random(); m_CUDA.LoadModule(m_nbody_kernel); m_IntegrateBodies = m_CUDA.GetModuleFunction("IntegrateBodies"); m_SofteningSquared = m_CUDA.GetModuleGlobal("softeningSquared"); h_Pos = new Float4[2][] { new Float4[m_NumBodies], new Float4[m_NumBodies] }; h_Vel = new Float4[2][] { new Float4[m_NumBodies], new Float4[m_NumBodies] }; d_Pos = new CUdeviceptr[2] { m_CUDA.Allocate<Float4>(HostOldPos), m_CUDA.Allocate<Float4>(HostNewPos) }; d_Vel = new CUdeviceptr[2] { m_CUDA.Allocate<Float4>(HostOldVel), m_CUDA.Allocate<Float4>(HostNewVel) }; float scale = 3.0f; float vscale = scale * 1.0f; for (int i = 0; i < HostOldPos.Length; i++) { recalc: HostOldPos[i].x = (float)(random.NextDouble() * 2 - 1.0); HostOldPos[i].y = (float)(random.NextDouble() * 2 - 1.0); HostOldPos[i].z = (float)(random.NextDouble() * 2 - 1.0); HostOldPos[i].w = 1.0f; if (dot(HostOldPos[i], HostOldPos[i]) > 1.0f) goto recalc; HostOldPos[i].x *= scale; HostOldPos[i].y *= scale; HostOldPos[i].z *= scale; } for (int i = 0; i < HostOldVel.Length; i++) { recalc: HostOldVel[i].x = (float)(random.NextDouble() * 2 - 1.0); HostOldVel[i].y = (float)(random.NextDouble() * 2 - 1.0); HostOldVel[i].z = (float)(random.NextDouble() * 2 - 1.0); HostOldVel[i].w = 1.0f; if (dot(HostOldVel[i], HostOldVel[i]) > 1.0f) goto recalc; HostOldPos[i].x *= vscale; HostOldPos[i].y *= vscale; HostOldPos[i].z *= vscale; } m_CUDA.CopyHostToDevice<Float4>(DeviceOldPos, HostOldPos); m_CUDA.CopyHostToDevice<Float4>(DeviceOldVel, HostOldVel); m_CUDA.CopyHostToDevice<float>(m_SofteningSquared, new float[] { softeningSquared }); }
public static extern CUResult cuParamSetf(CUfunction hfunc, int offset, float value);
public static extern CUResult cuParamSeti(CUfunction hfunc, int offset, uint value);
public static extern CUResult cuParamSetSize(CUfunction hfunc, uint numbytes);
private void InitCudaModule() { cuda = new CUDA(0, true); cuModule = cuda.LoadModule(Path.Combine(Environment.CurrentDirectory, cudaModuleName)); cuFuncDotProd = cuda.GetModuleFunction(cudaProductKernelName); cuFuncGradFinalize = cuda.GetModuleFunction(cudaGradFinalizeName); cuFuncComputeBBstep = cuda.GetModuleFunction(cudaComputeBBStepName); cuFuncObjSquareW = cuda.GetModuleFunction(cudaObjWName); cuFuncObjSquareAlpha = cuda.GetModuleFunction(cudaObjAlphaName); cuFuncUpdateW = cuda.GetModuleFunction(cudaUpdateW); cuFuncUpdateAlpha = cuda.GetModuleFunction(cudaUpdateAlphaName); cuFuncMaxNorm = cuda.GetModuleFunction(cudaMaxNormName); }
unsafe public FlaCudaTask(CUDA _cuda, int channelCount, int channels, uint bits_per_sample, int max_frame_size, bool do_verify) { cuda = _cuda; residualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * (lpc.MAX_LPC_ORDER * lpc.MAX_LPC_WINDOWS + 8) * FlaCudaWriter.maxFrames; bestResidualTasksLen = sizeof(FlaCudaSubframeTask) * channelCount * FlaCudaWriter.maxFrames; samplesBufferLen = sizeof(int) * FlaCudaWriter.MAX_BLOCKSIZE * channelCount; int partitionsLen = sizeof(int) * (30 << 8) * channelCount * FlaCudaWriter.maxFrames; int riceParamsLen = sizeof(int) * (4 << 8) * channelCount * FlaCudaWriter.maxFrames; int lpcDataLen = sizeof(float) * 32 * 33 * lpc.MAX_LPC_WINDOWS * channelCount * FlaCudaWriter.maxFrames; cudaSamplesBytes = cuda.Allocate((uint)samplesBufferLen / 2); cudaSamples = cuda.Allocate((uint)samplesBufferLen); cudaResidual = cuda.Allocate((uint)samplesBufferLen); cudaLPCData = cuda.Allocate((uint)lpcDataLen); cudaPartitions = cuda.Allocate((uint)partitionsLen); cudaRiceParams = cuda.Allocate((uint)riceParamsLen); cudaBestRiceParams = cuda.Allocate((uint)riceParamsLen / 4); cudaAutocorOutput = cuda.Allocate((uint)(sizeof(float) * channelCount * lpc.MAX_LPC_WINDOWS * (lpc.MAX_LPC_ORDER + 1) * (FlaCudaWriter.maxAutocorParts + FlaCudaWriter.maxFrames))); cudaResidualTasks = cuda.Allocate((uint)residualTasksLen); cudaBestResidualTasks = cuda.Allocate((uint)bestResidualTasksLen); cudaResidualOutput = cuda.Allocate((uint)(sizeof(int) * channelCount * (lpc.MAX_LPC_WINDOWS * lpc.MAX_LPC_ORDER + 8) * 64 /*FlaCudaWriter.maxResidualParts*/ * FlaCudaWriter.maxFrames)); CUResult cuErr = CUResult.Success; if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref samplesBytesPtr, (uint)samplesBufferLen/2); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref residualBufferPtr, (uint)samplesBufferLen); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref bestRiceParamsPtr, (uint)riceParamsLen / 4); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref residualTasksPtr, (uint)residualTasksLen); if (cuErr == CUResult.Success) cuErr = CUDADriver.cuMemAllocHost(ref bestResidualTasksPtr, (uint)bestResidualTasksLen); if (cuErr != CUResult.Success) { if (samplesBytesPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(samplesBytesPtr); samplesBytesPtr = IntPtr.Zero; if (residualBufferPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualBufferPtr); residualBufferPtr = IntPtr.Zero; if (bestRiceParamsPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestRiceParamsPtr); bestRiceParamsPtr = IntPtr.Zero; if (residualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(residualTasksPtr); residualTasksPtr = IntPtr.Zero; if (bestResidualTasksPtr != IntPtr.Zero) CUDADriver.cuMemFreeHost(bestResidualTasksPtr); bestResidualTasksPtr = IntPtr.Zero; throw new CUDAException(cuErr); } cudaComputeAutocor = cuda.GetModuleFunction("cudaComputeAutocor"); cudaStereoDecorr = cuda.GetModuleFunction("cudaStereoDecorr"); cudaChannelDecorr = cuda.GetModuleFunction("cudaChannelDecorr"); cudaChannelDecorr2 = cuda.GetModuleFunction("cudaChannelDecorr2"); cudaFindWastedBits = cuda.GetModuleFunction("cudaFindWastedBits"); cudaComputeLPC = cuda.GetModuleFunction("cudaComputeLPC"); cudaQuantizeLPC = cuda.GetModuleFunction("cudaQuantizeLPC"); cudaComputeLPCLattice = cuda.GetModuleFunction("cudaComputeLPCLattice"); cudaEstimateResidual = cuda.GetModuleFunction("cudaEstimateResidual"); cudaEstimateResidual8 = cuda.GetModuleFunction("cudaEstimateResidual8"); cudaEstimateResidual12 = cuda.GetModuleFunction("cudaEstimateResidual12"); cudaEstimateResidual1 = cuda.GetModuleFunction("cudaEstimateResidual1"); cudaChooseBestMethod = cuda.GetModuleFunction("cudaChooseBestMethod"); cudaCopyBestMethod = cuda.GetModuleFunction("cudaCopyBestMethod"); cudaCopyBestMethodStereo = cuda.GetModuleFunction("cudaCopyBestMethodStereo"); cudaEncodeResidual = cuda.GetModuleFunction("cudaEncodeResidual"); cudaCalcPartition = cuda.GetModuleFunction("cudaCalcPartition"); cudaCalcPartition16 = cuda.GetModuleFunction("cudaCalcPartition16"); cudaCalcLargePartition = cuda.GetModuleFunction("cudaCalcLargePartition"); cudaSumPartition = cuda.GetModuleFunction("cudaSumPartition"); cudaFindRiceParameter = cuda.GetModuleFunction("cudaFindRiceParameter"); cudaFindPartitionOrder = cuda.GetModuleFunction("cudaFindPartitionOrder"); stream = cuda.CreateStream(); samplesBuffer = new int[FlaCudaWriter.MAX_BLOCKSIZE * channelCount]; outputBuffer = new byte[max_frame_size * FlaCudaWriter.maxFrames + 1]; frame = new FlacFrame(channelCount); frame.writer = new BitWriter(outputBuffer, 0, outputBuffer.Length); if (do_verify) { verify = new FlakeReader(new AudioPCMConfig((int)bits_per_sample, channels, 44100)); verify.DoCRC = false; } }