public void GMMUpdate(int gmm_N, CudaDeviceVariable<float> gmm, CudaDeviceVariable<byte> scratch_mem, int gmm_pitch, CudaPitchedDeviceVariable<uchar4> image, CudaPitchedDeviceVariable<byte> alpha, int width, int height)
{
    dim3 grid = new dim3((width + 31) / 32, (height + 31) / 32, 1);
    dim3 block = new dim3(32, 4, 1);

    GMMReductionKernelCreateGmmFlags.BlockDimensions = block;
    GMMReductionKernelCreateGmmFlags.GridDimensions = grid;
    GMMReductionKernelNoCreateGmmFlags.BlockDimensions = block;
    GMMReductionKernelNoCreateGmmFlags.GridDimensions = grid;

    GMMReductionKernelCreateGmmFlags.Run(0, scratch_mem.DevicePointer + (grid.x * grid.y * 4), (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, alpha.DevicePointer, (int)alpha.Pitch, width, height, scratch_mem.DevicePointer);
    //GMMReductionKernel<4, true><<<grid, block>>>(0, &scratch_mem[grid.x * grid.y], gmm_pitch/4, image, image_pitch/4, alpha, alpha_pitch, width, height, (unsigned int*) scratch_mem);

    for (int i = 1; i < gmm_N; ++i)
    {
        GMMReductionKernelNoCreateGmmFlags.Run(i, scratch_mem.DevicePointer + (grid.x * grid.y * 4), (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, alpha.DevicePointer, (int)alpha.Pitch, width, height, scratch_mem.DevicePointer);
        //GMMReductionKernel<4, false><<<grid, block>>>(i, &scratch_mem[grid.x * grid.y], gmm_pitch/4, image, image_pitch/4, alpha, alpha_pitch, width, height, (unsigned int*) scratch_mem);
    }

    GMMFinalizeKernelInvertSigma.BlockDimensions = new dim3(32 * 4, 1, 1);
    GMMFinalizeKernelInvertSigma.GridDimensions = new dim3(gmm_N, 1, 1);
    GMMFinalizeKernelInvertSigma.Run(gmm.DevicePointer, scratch_mem.DevicePointer + (grid.x * grid.y * 4), (int)gmm_pitch / 4, grid.x * grid.y);
    //GMMFinalizeKernel<4, true><<<gmm_N, 32*4>>>(gmm, &scratch_mem[grid.x * grid.y], gmm_pitch/4, grid.x * grid.y);

    block.x = 32;
    block.y = 2;
    GMMcommonTerm.BlockDimensions = block;
    GMMcommonTerm.GridDimensions = new dim3(1, 1, 1);
    GMMcommonTerm.Run(gmm_N / 2, gmm.DevicePointer, (int)gmm_pitch / 4);
    //GMMcommonTerm<<<1, block>>>(gmm_N / 2, gmm, gmm_pitch/4);
}
static double[] SumMatrixManagedCuda(double[][,] matrix)
{
    int Z = matrix.Length;
    int Y = matrix[0].GetLength(0);
    int X = matrix[0].GetLength(1);
    var result = new double[Y * X];
    var lm = ToLinearArray(matrix);
    int N = lm.Length;

    matrixSumCude.SetComputeSize((uint)X, (uint)Y);
    //matrixSumCude.BlockDimensions = 128;
    //matrixSumCude.GridDimensions = (N + 127) / 128;

    var da = cntxt.AllocateMemory(N * sizeof(double));
    var db = cntxt.AllocateMemory(result.Length * sizeof(double));
    cntxt.CopyToDevice(da, lm);
    cntxt.CopyToDevice(db, result);

    //CudaDeviceVariable<int> dA = a;
    //CudaDeviceVariable<int> dB = b;
    //CudaDeviceVariable<int> dC = new CudaDeviceVariable<int>(N);

    // Invoke kernel
    //kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);
    matrixSumCude.Run(db, da, X, Y, Z);

    cntxt.CopyToHost<double>(result, db);
    return result;
}
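// ToLinearArray is not shown above. A minimal sketch of what such a helper presumably
// does (flatten a double[][,] stack of equally sized 2D slices into one row-major 1D
// array); the name and memory layout are assumptions based on how SumMatrixManagedCuda
// sizes and indexes the data.
static double[] ToLinearArray(double[][,] matrix)
{
    int Z = matrix.Length;
    int Y = matrix[0].GetLength(0);
    int X = matrix[0].GetLength(1);
    var result = new double[Z * Y * X];
    for (int z = 0; z < Z; z++)
        for (int y = 0; y < Y; y++)
            for (int x = 0; x < X; x++)
                result[z * Y * X + y * X + x] = matrix[z][y, x];   // slice-major, row-major within a slice
    return result;
}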
public static void update_particles(CudaDeviceVariable<float> d_vx, CudaDeviceVariable<float> d_vy, CudaDeviceVariable<float> d_vz, int cnt)
{
    // Classic fourth-order Runge-Kutta step: four velocity evaluations (k1..k4)
    // followed by a combined position update.
    float d_dt = 0;
    _gpuVelocity.BlockDimensions = new dim3(1, 1, 1);
    _gpuVelocity.GridDimensions = new dim3(cnt, 1, 1);

    // k1: evaluate at the current position (dt = 0).
    _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer, d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer, d_dt, d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer, d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer, torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);

    // k2 and k3: evaluate at the half step.
    d_dt = _dt * 0.5f;
    _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer, d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer, d_dt, d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer, d_k2x.DevicePointer, d_k2y.DevicePointer, d_k2z.DevicePointer, torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);
    _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer, d_k2x.DevicePointer, d_k2y.DevicePointer, d_k2z.DevicePointer, d_dt, d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer, d_k3x.DevicePointer, d_k3y.DevicePointer, d_k3z.DevicePointer, torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);

    // k4: evaluate at the full step.
    d_dt = _dt;
    _gpuVelocity.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer, d_k3x.DevicePointer, d_k3y.DevicePointer, d_k3z.DevicePointer, d_dt, d_vx.DevicePointer, d_vy.DevicePointer, d_vz.DevicePointer, d_k4x.DevicePointer, d_k4y.DevicePointer, d_k4z.DevicePointer, torus_size.DevicePointer, torus_res.DevicePointer, torus_d.DevicePointer);

    // Combine k1..k4 into the position update.
    _gpuUpdate.BlockDimensions = new dim3(1, 1, 1);
    _gpuUpdate.GridDimensions = new dim3(cnt * 3, 1, 1);
    _gpuUpdate.Run(x.DevicePointer, y.DevicePointer, z.DevicePointer, d_k1x.DevicePointer, d_k1y.DevicePointer, d_k1z.DevicePointer, d_k2x.DevicePointer, d_k2y.DevicePointer, d_k2z.DevicePointer, d_k3x.DevicePointer, d_k3y.DevicePointer, d_k3z.DevicePointer, d_k4x.DevicePointer, d_k4y.DevicePointer, d_k4z.DevicePointer, d_dt);
}
public void ApplyMatte(int mode, CudaPitchedDeviceVariable<uchar4> result, CudaPitchedDeviceVariable<uchar4> image, CudaPitchedDeviceVariable<byte> matte, int width, int height)
{
    dim3 block = new dim3(32, 8, 1);
    dim3 grid = new dim3((width + 31) / 32, (height + 31) / 32, 1);

    switch (mode)
    {
        case 0:
            ApplyMatteKernelMode0.BlockDimensions = block;
            ApplyMatteKernelMode0.GridDimensions = grid;
            ApplyMatteKernelMode0.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height);
            //ApplyMatteKernel<0><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height);
            break;
        case 1:
            ApplyMatteKernelMode1.BlockDimensions = block;
            ApplyMatteKernelMode1.GridDimensions = grid;
            ApplyMatteKernelMode1.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height);
            //ApplyMatteKernel<1><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height);
            break;
        case 2:
            ApplyMatteKernelMode2.BlockDimensions = block;
            ApplyMatteKernelMode2.GridDimensions = grid;
            ApplyMatteKernelMode2.Run(result.DevicePointer, (int)result.Pitch / 4, image.DevicePointer, (int)image.Pitch / 4, matte.DevicePointer, (int)matte.Pitch, width, height);
            //ApplyMatteKernel<2><<<grid, block>>>(result, result_pitch/4, image, image_pitch/4, matte, matte_pitch, width, height);
            break;
    }
}
//Test CUDA kernel for complex multiplication
public void test(int N)
{
    CudaContext ctx = new CudaContext();
    CudaKernel kernel = ctx.LoadKernel("kernel.ptx", "ComplexMultCUDA");
    kernel.GridDimensions = N;
    kernel.BlockDimensions = 1;

    double2[] a = new double2[N];
    double2[] b = new double2[N];
    double2[] c = new double2[N];
    for (int i = 0; i < N; i++)
    {
        a[i].x = 1;
        a[i].y = 3;
        b[i].x = 2;
        b[i].y = 2;
    }

    CudaDeviceVariable<double2> d_a = null;
    CudaDeviceVariable<double2> d_b = null;
    try
    {
        d_a = a;
        d_b = b;
    }
    catch (Exception e)
    {
        Console.WriteLine("{0} Exception caught.", e);
        return;
    }

    kernel.Run(d_a.DevicePointer, d_b.DevicePointer, N);
    c = d_b;
    Console.WriteLine("C.last()={0}+i{1}", c.Last().x, c.Last().y);
}
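// A hedged host-side check for the test above, assuming ComplexMultCUDA multiplies the
// arrays element-wise and writes the product into the second operand: every element is
// (1 + 3i) * (2 + 2i) = (1*2 - 3*2) + (1*2 + 3*2)i = -4 + 8i. This is only a sketch of
// how the printed output could be asserted.
static void CheckComplexMultResult(double2[] c)
{
    double2 last = c[c.Length - 1];
    bool ok = Math.Abs(last.x + 4.0) < 1e-9 && Math.Abs(last.y - 8.0) < 1e-9;
    Console.WriteLine(ok
        ? "ComplexMultCUDA result matches the expected -4 + 8i"
        : "Unexpected result - check which operand the kernel writes to");
}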
public void CalculateFitness(CudaDeviceVariable<byte> population, CudaDeviceVariable<float> fitness)
{
    Profiler.Start("Calculate accuracy");
    var deviceAccuracy = accuracyCalc.CalculateAccuracy(population);
    Profiler.Stop("Calculate accuracy");
    float[] asdf = deviceAccuracy;

    Profiler.Start("Calculate vectorSizes");
    countVectorsKernel.Calculate(population, vectorSizes);
    Profiler.Stop("Calculate vectorSizes");
    int[] v = vectorSizes;

    Profiler.Start("Avrage VectorSizes");
    float avrageVectorSize = Thrust.Avrage(vectorSizes);
    Profiler.Stop("Avrage VectorSizes");

    Profiler.Start("Avrage accuracy");
    float avrageAccuracy = Thrust.Avrage(deviceAccuracy);
    Profiler.Stop("Avrage accuracy");

    Profiler.Start("fittness kernel");
    fitnessKernel.Run(
        deviceAccuracy.DevicePointer,
        avrageAccuracy,
        vectorSizes.DevicePointer,
        avrageVectorSize,
        fitness.DevicePointer
        );
    Profiler.Stop("fittness kernel");
}
/// <summary>Runs the kernel in synchronous mode.</summary>
/// <param name="args">MyMemoryBlock arguments are automatically converted to device pointers.</param>
public void RunSync(params object[] args)
{
    CheckExecutionSetup();
    ConvertMemoryBlocksToDevicePtrs(args);
    m_kernel.Run(args);
}
public void CreateNewPopulation()
{
    var tmp = populationGens;
    populationGens = populationGens2;
    populationGens2 = tmp;
    // var b = new FlattArray<byte>((byte[])populationGens, genLength).To2d();

    Profiler.Start("Calculate fitness");
    fitnessCalc.CalculateFitness(populationGens, deviceFitnes);
    Profiler.Stop("Calculate fitness");

    Profiler.Start("sqauance");
    Thrust.seaquance(fitnessIndeces);
    Profiler.Stop("sqauance");

    Profiler.Start("sorting fitness");
    Thrust.sort_by_key(deviceFitnes, fitnessIndeces);
    Profiler.Stop("sorting fitness");

    Profiler.Start("performing genetics");
    // var c = new FlattArray<byte>((byte[])populationGens, genLength).To2d();
    performGeneticAlgorythm.Run(
        populationGens.DevicePointer,
        populationGens2.DevicePointer,
        deviceFitnes.DevicePointer,
        fitnessIndeces.DevicePointer
        );
    //var a = new FlattArray<byte>((byte[])populationGens, genLength).To2d();
    Profiler.Stop("performing genetics");
}
public void CUDA_AddFloatArrays()
{
    //Load Kernel image from resources
    Stream stream = new StreamReader(resName).BaseStream;
    if (stream == null)
    {
        throw new ArgumentException("Kernel not found in resources.");
    }
    vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd");

    var threadsPerBlock = 1024;
    vectorAddKernel.BlockDimensions = threadsPerBlock;
    vectorAddKernel.GridDimensions = (Count + threadsPerBlock - 1) / threadsPerBlock;

    CudaStopWatch w = new CudaStopWatch();
    w.Start();
    vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count);
    w.Stop();

    Debug.Log(w.GetElapsedTime() / 1000.0f);
    Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}");
    Debug.Log($"{h_A[Count - 1]} + {h_B[Count - 1]} = {C[Count - 1]}");

    // Copy result from device memory to host memory
    // h_C contains the result in host memory
    // h_C = d_C;
}
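// CUDA_AddFloatArrays relies on fixture state (ctx, Count, resName, h_A, h_B, d_A, d_B, C)
// that is not shown. A minimal sketch of what that setup could look like; the field names
// mirror the test above, but the sizes, resource name and fill values are assumptions.
CudaContext ctx;
const int Count = 1 << 20;
string resName = "kernel.ptx";             // embedded resource containing the VecAdd PTX (assumed)
float[] h_A, h_B;                          // host inputs
CudaDeviceVariable<float> d_A, d_B, C;     // device inputs and output

void SetupVectors()
{
    ctx = new CudaContext(0);
    h_A = Enumerable.Range(0, Count).Select(i => (float)i).ToArray();
    h_B = Enumerable.Range(0, Count).Select(i => 2.0f * i).ToArray();
    d_A = h_A;                             // implicit host-to-device copies
    d_B = h_B;
    C = new CudaDeviceVariable<float>(Count);
}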
public CudaDeviceVariable<T> PrefixSumArray<T>(CudaDeviceVariable<T> input, int n) where T : struct
{
    int arrayLength = n;
    int batchSize = n / arrayLength;

    if (!IsPowerOfTwo(n))
    {
        throw new Exception("Input array length is not power of two.");
    }

    CudaDeviceVariable<T> output = new CudaDeviceVariable<T>(n);
    CudaDeviceVariable<T> buffer = new CudaDeviceVariable<T>(n);

    kernelScanExclusiveShared.BlockDimensions = threadBlockSize;
    kernelScanExclusiveShared.GridDimensions = (batchSize * arrayLength) / (4 * threadBlockSize);
    kernelScanExclusiveShared.Run(output.DevicePointer, input.DevicePointer, 4 * threadBlockSize);

    kernelScanExclusiveShared2.BlockDimensions = threadBlockSize;
    kernelScanExclusiveShared2.GridDimensions = (int)Math.Ceiling(((batchSize * arrayLength) / (4 * threadBlockSize)) / (double)threadBlockSize);
    kernelScanExclusiveShared2.Run(buffer.DevicePointer, output.DevicePointer, input.DevicePointer, (batchSize * arrayLength) / (4 * threadBlockSize), arrayLength / (4 * threadBlockSize));

    kernelUniformUpdate.BlockDimensions = threadBlockSize;
    kernelUniformUpdate.GridDimensions = (batchSize * arrayLength) / (4 * threadBlockSize);
    kernelUniformUpdate.Run(output.DevicePointer, buffer.DevicePointer);

    buffer.Dispose();
    return output;
}
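// A hedged usage sketch of the exclusive scan above, mirroring how it is called from
// Generate() further down (prefixScan.PrefixSumArray(...)). It assumes the scan kernels
// were built for int data and that n is a power of two large enough for the
// 4 * threadBlockSize blocking to produce whole grid sizes.
int n = 1 << 16;
int[] host = Enumerable.Repeat(1, n).ToArray();           // all ones -> exclusive scan yields 0, 1, 2, ...
using (CudaDeviceVariable<int> input = host)
using (CudaDeviceVariable<int> scannedDev = prefixScan.PrefixSumArray(input, n))
{
    int[] scanned = scannedDev;                            // implicit device-to-host copy
    Console.WriteLine("scanned[10] = {0}", scanned[10]);   // expected: 10
}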
// Testing managed CUDA call
private static void RunCudaWithAKernel()
{
    // C# Cuda code to call kernel
    int N = 50000;
    int deviceID = 0;
    CudaContext ctx = new CudaContext(deviceID);
    CudaKernel kernel = ctx.LoadKernel("kernel_x64.ptx", "VecAdd");

    int numOfThreads = 256;
    kernel.GridDimensions = (N + numOfThreads - 1) / numOfThreads;
    kernel.BlockDimensions = numOfThreads;

    // allocate memory in host (not gpu)
    var h_A = InitWithData(N, numOfThreads * 4);
    var h_B = InitWithData(N, numOfThreads);

    // Allocate vectors in device memory and copy from host to device.
    CudaDeviceVariable<float> d_A = h_A;
    CudaDeviceVariable<float> d_B = h_B;
    CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(N);

    //Invoke kernel
    kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N);
    Console.WriteLine("kernel has runeth");

    //Copy from memory of device to host.
    float[] h_C = d_C;
}
public override void MatrixBellmanErrorAndDerivative(Matrix predictedQValues, Matrix maxQHatValues, Matrix chosenActionIndices, Matrix currentRewards, Matrix error, Matrix errorDerivative, float discount, Matrix isLastEpisode, bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false)
{
    this.VerifyDimentionalityOfMatrices(predictedQValues, errorDerivative);
    this.VerifyColumnWithRowOfMatrices(predictedQValues, maxQHatValues);
    this.VerifyDimentionalityOfMatrices(maxQHatValues, chosenActionIndices, currentRewards);
    this.VerifyDimentionalityOfMatrices(currentRewards, error, isLastEpisode);

    if (copyInputsFromCpuToGpu)
    {
        predictedQValues.CopyToCuda();
        maxQHatValues.CopyToCuda();
    }

    CudaKernel kernel = InitializeGridsAndThreads("_Z26matrixBellmanErrorAndDerivPfS_S_S_S_S_fS_ii", errorDerivative);
    kernel.Run(predictedQValues.DeviceData.DevicePointer, maxQHatValues.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer, error.DeviceData.DevicePointer, errorDerivative.DeviceData.DevicePointer, discount, isLastEpisode.DeviceData.DevicePointer, errorDerivative.Row, errorDerivative.Column);

    if (copyOutputsFromGpuToCpu)
    {
        error.CopyFromCuda();
        errorDerivative.CopyFromCuda();
    }
}
public override void DqnStanfordEvaluation(Matrix predictedActionIndices, Matrix chosenActionIndices, Matrix currentRewards, Matrix matchPredictRewards, Matrix nonMatchPredictRewards, bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false)
{
    this.VerifyDimentionalityOfMatrices(predictedActionIndices, chosenActionIndices, currentRewards);
    this.VerifyDimentionalityOfMatrices(currentRewards, matchPredictRewards, nonMatchPredictRewards);

    if (copyInputsFromCpuToGpu)
    {
        predictedActionIndices.CopyToCuda();
        chosenActionIndices.CopyToCuda();
        currentRewards.CopyToCuda();
    }

    CudaKernel kernel = InitializeGridsAndThreads("_Z21DqnStanfordEvaluationPfS_S_S_S_i", matchPredictRewards);
    kernel.Run(predictedActionIndices.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer, matchPredictRewards.DeviceData.DevicePointer, nonMatchPredictRewards.DeviceData.DevicePointer, matchPredictRewards.Row);

    if (copyOutputsFromGpuToCpu)
    {
        // Copy the results from GPU back to CPU.
        matchPredictRewards.CopyFromCuda();
        nonMatchPredictRewards.CopyFromCuda();
    }
}
public static void blaa()
{
    int num = 10;

    //New context creation
    CudaContext cntxt = new CudaContext();

    //Module loading from precompiled .ptx in a project output folder
    CUmodule cumodule = cntxt.LoadModule("kernel.ptx");

    //_Z9addKernelPf - function name, can be found in *.ptx file
    CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt);

    //Create device array for data
    CudaDeviceVariable<float> vec1_device = new CudaDeviceVariable<float>(num);

    //Create arrays with data
    float[] vec1 = new float[num];

    //Copy data to device
    vec1_device.CopyToDevice(vec1);

    //Set grid and block dimensions
    addWithCuda.GridDimensions = new dim3(8, 1, 1);
    addWithCuda.BlockDimensions = new dim3(512, 1, 1);

    //Run the kernel
    addWithCuda.Run(vec1_device.DevicePointer);

    //Copy data from device
    vec1_device.CopyToHost(vec1);
}
public void Step(float stepSize)
{
    // Load the next vector fields to device memory.
    LoadNextField();
    _cudaDxMapper.MapAllResources();
    CudaArray2D lastFlowMap = _cudaDxMapper[0].GetMappedArray2D(0, 0);

    // Advect from each member to each member. In each block, the same configuration is chosen.
    dim3 grid = new dim3((int)((float)_width / BLOCK_SIZE + 0.5f), (int)((float)_height / BLOCK_SIZE + 0.5f), _numMembers);

    // Advect a block in each member-member combination.
    dim3 threads = new dim3(BLOCK_SIZE, BLOCK_SIZE);

    _advectParticlesKernel.GridDimensions = grid;
    _advectParticlesKernel.BlockDimensions = threads;
    // (float* mapT1, const int width, const int height, const int numMembers, /*float timeScale, */ float stepSize, float minDensity, float invalid)
    _advectParticlesKernel.Run(_pongFlowMap.DevicePointer, _width, _height, _numMembers, stepSize, 0.000001f, _texInvalidValue);

    // Swap the Texture2D handles.
    CudaSurfObject surf = new CudaSurfObject(lastFlowMap);
    grid.z = 1;
    _copyMapDataKernel.GridDimensions = grid;
    _copyMapDataKernel.BlockDimensions = threads;
    _copyMapDataKernel.Run(surf.SurfObject, _pongFlowMap.DevicePointer, _width, _height);

    _cudaDxMapper.UnmapAllResources();
}
//private CudaKernel kernel1;
//public Class1()
//{
//    //int deviceID = 0;
//    //CudaContext ctx = new CudaContext(deviceID);
//    //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
//    //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);
//}
public static double[,] TestMatrix(double[][,] a)
{
    using (CudaContext ctx = new CudaContext(0))
    {
        CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx");
        var kernel = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx);

        int dimZ = a.Length;
        int dimX = a[0].GetLength(0);
        int dimY = a[0].GetLength(1);

        kernel.GridDimensions = new dim3(28, 28, 1);
        kernel.BlockDimensions = new dim3(1, 1, 1);
        //kernel.BlockDimensions = new dim3(dimX, dimY, 1);

        // Allocate vectors in device memory and copy vectors from host memory to device memory
        CudaDeviceVariable<double> dA = a.ToLinearArray();
        //CudaDeviceVariable<double> dB = ToLinearArray(b);
        CudaDeviceVariable<double> dC = new CudaDeviceVariable<double>(dimX * dimY);

        // Invoke kernel
        kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ);

        // Copy result from device memory to host memory
        double[] c = dC;

        //ctx.FreeMemory(dC.DevicePointer);
        //ctx.FreeMemory(dA.DevicePointer);
        //ctx.Dispose();
        return ToMultyArray(c, dimX);
    }
}
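// ToMultyArray is not shown above. A minimal sketch of the reshape it presumably performs
// (1D row-major buffer back into a dimX-by-dimY 2D array); the exact row/column convention
// is an assumption based on how dC is sized in TestMatrix.
static double[,] ToMultyArray(double[] data, int dimX)
{
    int dimY = data.Length / dimX;
    var result = new double[dimX, dimY];
    for (int x = 0; x < dimX; x++)
        for (int y = 0; y < dimY; y++)
            result[x, y] = data[x * dimY + y];
    return result;
}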
public float BaseAccuracy()
{
    var baseKernel = context.LoadKernel("kernels/VectorReduction.ptx", "calculateAccuracy");
    dim3 gridDimension = new dim3()
    {
        x = (uint)(test.length / ThreadsPerBlock + 1),
        y = (uint)1,
        z = 1
    };
    baseKernel.GridDimensions = gridDimension;
    baseKernel.BlockDimensions = ThreadsPerBlock;
    baseKernel.SetConstantVariable("testVectorsCount", test.length);
    baseKernel.SetConstantVariable("teachingVectorsCount", teaching.length);
    baseKernel.SetConstantVariable("attributeCount", teaching.attributeCount);
    baseKernel.SetConstantVariable("genLength", teaching.length);

    var BaseRMSEKernel = context.LoadKernel("kernels/VectorReduction.ptx", "RMSE");
    BaseRMSEKernel.GridDimensions = 1;
    BaseRMSEKernel.BlockDimensions = 1;
    BaseRMSEKernel.SetConstantVariable("testVectorsCount", test.length);

    byte[] gen = new byte[teaching.length];
    for (int i = 0; i < gen.Length; i++)
    {
        gen[i] = 1;
    }

    using (CudaDeviceVariable<byte> deviceGen = gen)
    using (CudaDeviceVariable<float> baseAccuracy = new CudaDeviceVariable<float>(1))
    {
        accuracyKernel.Run(
            test.classes.DevicePointer,
            teaching.classes.DevicePointer,
            deviceGen.DevicePointer,
            calculatedNeabours.DevicePointer,
            deviceAccuracy.DevicePointer
            );
        BaseRMSEKernel.Run(baseAccuracy.DevicePointer);

        float[] host = baseAccuracy;
        return host[0];
    }
}
public List<float> hypotesis(List<double> x, List<double> h, int N)
{
    //int N = 2000000;
    string path = Path.GetDirectoryName(mv.plugins[0].filename);
    CudaContext ctx = new CudaContext();
    CudaKernel kernel = ctx.LoadKernel(path + "\\kernel.ptx", "ComplexMultCUDA");
    kernel.GridDimensions = (int)Math.Ceiling((double)(N + h.Count - 1) / 1024);
    kernel.BlockDimensions = 1024;

    double[] temp_y = new double[N + h.Count - 1];
    double[] temp_h = new double[N + h.Count - 1];
    double[] temp_x = new double[N + h.Count - 1];
    double2[] temp_x2 = new double2[N + h.Count - 1];
    h.ToArray().CopyTo(temp_h, 0);
    x.ToArray().CopyTo(temp_x, 0);

    CudaDeviceVariable<double> d_x = null;
    CudaDeviceVariable<double2> d_X = new CudaDeviceVariable<double2>(N + h.Count - 1);
    CudaDeviceVariable<double> d_h = new CudaDeviceVariable<double>(N + h.Count - 1);
    CudaDeviceVariable<double2> d_H = new CudaDeviceVariable<double2>(N + h.Count - 1);
    CudaDeviceVariable<double> d_y = new CudaDeviceVariable<double>(N + h.Count - 1);

    CudaFFTPlan1D planForward = new CudaFFTPlan1D(N + h.Count - 1, cufftType.D2Z, 1);
    CudaFFTPlan1D planInverse = new CudaFFTPlan1D(N + h.Count - 1, cufftType.Z2D, 1);

    try
    {
        d_h = temp_h;
        planForward.Exec(d_h.DevicePointer, d_H.DevicePointer, TransformDirection.Forward);
    }
    catch (Exception exp)
    {
        mainView.log(exp, "CUDA error: Impulse response FFT", this);
        return null;
    }

    try
    {
        // FFT-based convolution: multiply the spectra, then transform back. Note that
        // cuFFT transforms are unnormalized, so any 1/(N + h.Count - 1) scaling has to
        // happen inside the kernel or on the result.
        d_x = temp_x;
        planForward.Exec(d_x.DevicePointer, d_X.DevicePointer);
        kernel.Run(d_H.DevicePointer, d_X.DevicePointer, N + h.Count - 1);
        planInverse.Exec(d_X.DevicePointer, d_y.DevicePointer);
    }
    catch (Exception exp)
    {
        mainView.log(exp, "CUDA error: kernel run", this);
    }

    temp_y = d_y;
    return Array.ConvertAll<double, float>(temp_y, d => (float)d).ToList().GetRange(500, x.Count);
}
private void addForces(CudaPitchedDeviceVariable<float2> v, int dx, int dy, int spx, int spy, float fx, float fy, int r, SizeT tPitch)
{
    dim3 tids = new dim3((uint)(2 * r + 1), (uint)(2 * r + 1), 1);

    addForces_k.GridDimensions = new dim3(1);
    addForces_k.BlockDimensions = tids;
    addForces_k.Run(v.DevicePointer, dx, dy, spx, spy, fx, fy, r, tPitch);
}
static void Main(string[] args)
{
    InitKernels();

    int n = 0x1000000;
    float[] xx = new float[n];
    float[] yy = new float[n];
    float[] zz = new float[n];
    int[] tt = new int[n];

    float L = 512;
    float Lx = L, Ly = L, Lz = L;
    CudaDeviceVariable<float> gpu_lengths = new float[] { Lx, Ly, Lz };

    for (int i = 0; i < n; i++)
    {
        xx[i] = ((float)(i & 0x0000FF) / (float)0x000100 - 0.5F) * Lx;
        yy[i] = ((float)(i & 0x00FF00) / (float)0x010000 - 0.5F) * Ly;
        zz[i] = ((float)(i & 0xFF0000) / (float)0x1000000 - 0.5F) * Lz;
        tt[i] = 1;
    }

    SimulationMolecules simulationMolecules = new SimulationMolecules(xx, yy, zz, tt);
    for (int i = 0; i < 10; i++)
    {
        Console.WriteLine($"{i} = ({simulationMolecules.x[i]},{simulationMolecules.y[i]},{simulationMolecules.z[i]})");
    }
    int ii = 0x808080;
    Console.WriteLine($"{ii} = ({simulationMolecules.x[ii]},{simulationMolecules.y[ii]},{simulationMolecules.z[ii]})");

    Interactions interactions = new Interactions();
    interactions.SetLJParameters(1, 1, 1.5F, 1.5F);

    float[] cachedEnergies = new float[n / THREADS_PER_BLOCK + 1];
    CudaDeviceVariable<float> gpu_energies = cachedEnergies;

    energyOfExistingMolecule.Run(n,
        simulationMolecules.gpu_x.DevicePointer,
        simulationMolecules.gpu_y.DevicePointer,
        simulationMolecules.gpu_z.DevicePointer,
        simulationMolecules.gpu_types.DevicePointer,
        interactions.SigmaPointer(),
        interactions.EpsilonPointer(),
        gpu_lengths.DevicePointer,
        2.5F,
        0x808080,
        gpu_energies.DevicePointer
        );

    gpu_energies.CopyToHost(cachedEnergies);
    double e = cachedEnergies.Sum();
    Console.WriteLine($"Energy = {e}");
    Console.Read();
}
public void DownscaleTrimap(CudaPitchedDeviceVariable<byte> small_image, int small_width, int small_height, CudaPitchedDeviceVariable<byte> image, int width, int height)
{
    dim3 grid = new dim3((width + 63) / 64, (height + 63) / 64, 1);
    dim3 block = new dim3(32, 8, 1);

    downscaleKernel2.BlockDimensions = block;
    downscaleKernel2.GridDimensions = grid;
    downscaleKernel2.Run(small_image.DevicePointer, (int)small_image.Pitch, small_width, small_height, image.DevicePointer, (int)image.Pitch, width, height);
    //downscaleKernel<<<grid, block>>>(small_image, small_pitch, small_width, small_height, image, pitch, width, height, maxfilter_functor());
}
private void diffuseProject(CudaDeviceVariable<float2> vx, CudaDeviceVariable<float2> vy, int dx, int dy, float dt, float visc, SizeT tPitch)
{
    dim3 grid = new dim3((uint)((dx / TILEX) + ((dx % TILEX != 0) ? 1 : 0)),
                         (uint)((dy / TILEY) + ((dy % TILEY != 0) ? 1 : 0)), 1);
    dim3 tids = new dim3(TIDSX, TIDSY, 1);

    diffuseProject_k.GridDimensions = grid;
    diffuseProject_k.BlockDimensions = tids;
    diffuseProject_k.Run(vx.DevicePointer, vy.DevicePointer, dx, dy, dt, visc, TILEY / TIDSY);
}
public void DataTerm(CudaPitchedDeviceVariable<int> terminals, int gmmN, CudaDeviceVariable<float> gmm, int gmm_pitch, CudaPitchedDeviceVariable<uchar4> image, CudaPitchedDeviceVariable<byte> trimap, int width, int height)
{
    dim3 block = new dim3(32, 8, 1);
    dim3 grid = new dim3((int)((width + block.x - 1) / block.x), (int)((height + block.y - 1) / block.y), 1);

    DataTermKernel.BlockDimensions = block;
    DataTermKernel.GridDimensions = grid;
    DataTermKernel.Run(terminals.DevicePointer, (int)terminals.Pitch / 4, gmmN, gmm.DevicePointer, (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, trimap.DevicePointer, (int)trimap.Pitch, width, height);
    //DataTermKernel<<<grid, block>>>(terminals, terminal_pitch/4, gmmN, gmm, gmm_pitch/4, image, image_pitch/4, trimap, trimap_pitch, width, height);
}
public void GMMAssign(int gmmN, CudaDeviceVariable<float> gmm, int gmm_pitch, CudaPitchedDeviceVariable<uchar4> image, CudaPitchedDeviceVariable<byte> alpha, int width, int height)
{
    dim3 block = new dim3(32, 16, 1);
    dim3 grid = new dim3((int)((width + block.x - 1) / block.x), (int)((height + block.y - 1) / block.y), 1);

    GMMAssignKernel.BlockDimensions = block;
    GMMAssignKernel.GridDimensions = grid;
    GMMAssignKernel.Run(gmmN, gmm.DevicePointer, (int)gmm_pitch / 4, image.DevicePointer, (int)image.Pitch / 4, alpha.DevicePointer, (int)alpha.Pitch, width, height);
    //GMMAssignKernel<<<grid, block>>>(gmmN, gmm, gmm_pitch/4, image, image_pitch/4, alpha, alpha_pitch, width, height);
}
private void advectParticles(CudaDeviceVariable<vertex> p, CudaPitchedDeviceVariable<float2> v, int dx, int dy, float dt, SizeT tPitch)
{
    dim3 grid = new dim3((uint)((dx / TILEX) + ((dx % TILEX != 0) ? 1 : 0)),
                         (uint)((dy / TILEY) + ((dy % TILEY != 0) ? 1 : 0)), 1);
    dim3 tids = new dim3(TIDSX, TIDSY, 1);

    advectParticles_k.GridDimensions = grid;
    advectParticles_k.BlockDimensions = tids;
    advectParticles_k.Run(p.DevicePointer, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch);
}
private void updateVelocity(CudaPitchedDeviceVariable<float2> v, CudaDeviceVariable<float2> vx, CudaDeviceVariable<float2> vy, int dx, int pdx, int dy, SizeT tPitch)
{
    dim3 grid = new dim3((uint)((dx / TILEX) + ((dx % TILEX != 0) ? 1 : 0)),
                         (uint)((dy / TILEY) + ((dy % TILEY != 0) ? 1 : 0)), 1);
    dim3 tids = new dim3(TIDSX, TIDSY, 1);

    updateVelocity_k.GridDimensions = grid;
    updateVelocity_k.BlockDimensions = tids;
    updateVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, TILEY / TIDSY, tPitch);
}
private void BlurGpu(int iterations)
{
    for (int i = 0; i < BlurIterations; i++)
    {
        kernel.Run(input.DevicePointer, output.DevicePointer, Width, Height);
    }

    // Copy result from device memory to host memory
    //float[] copyOutput = output;
    image = output;
}
public void Run(params object[] args)
{
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i] is MyAbstractMemoryBlock)
        {
            args[i] = (args[i] as MyAbstractMemoryBlock).GetDevicePtr(m_GPU);
        }
    }
    m_kernel.Run(args);
}
////////////////////////////////////////////////////////////////////////////////
// Occupancy-based launch configurator
//
// The launch configurator, cudaOccupancyMaxPotentialBlockSize and
// cudaOccupancyMaxPotentialBlockSizeVariableSMem, suggests a block
// size that achieves the best theoretical occupancy. It also returns
// the minimum number of blocks needed to achieve the occupancy on the
// whole device.
//
// This launch configurator is purely occupancy-based. It doesn't
// translate directly to performance, but the suggestion should
// nevertheless be a good starting point for further optimizations.
//
// This function configures the launch based on the "automatic"
// argument, records the runtime, and reports occupancy and runtime.
////////////////////////////////////////////////////////////////////////////////
static int launchConfig(CudaDeviceVariable<int> array, int arrayCount, bool automatic)
{
    int blockSize = 0;
    int minGridSize = 0;
    int gridSize;
    SizeT dynamicSMemUsage = 0;

    float elapsedTime;
    double potentialOccupancy;

    CudaOccupancy.cudaOccDeviceState state = new CudaOccupancy.cudaOccDeviceState();
    state.cacheConfig = CudaOccupancy.cudaOccCacheConfig.PreferNone;

    if (automatic)
    {
        CudaOccupancy.cudaOccMaxPotentialOccupancyBlockSize(ref minGridSize, ref blockSize, new CudaOccupancy.cudaOccDeviceProp(0), new CudaOccupancy.cudaOccFuncAttributes(kernel), state, dynamicSMemUsage);
        Console.WriteLine("Suggested block size: {0}", blockSize);
        Console.WriteLine("Minimum grid size for maximum occupancy: {0}", minGridSize);
    }
    else
    {
        // This block size is too small. Given the limited number of
        // active blocks per multiprocessor, the number of active
        // threads will be limited, and thus unable to achieve maximum
        // occupancy.
        blockSize = manualBlockSize;
    }

    // Round up
    gridSize = (arrayCount + blockSize - 1) / blockSize;

    // Launch and profile
    kernel.GridDimensions = gridSize;
    kernel.BlockDimensions = blockSize;
    elapsedTime = kernel.Run(array.DevicePointer, arrayCount);

    // Calculate occupancy
    potentialOccupancy = reportPotentialOccupancy(blockSize, dynamicSMemUsage);
    Console.WriteLine("Potential occupancy: {0}%", potentialOccupancy * 100);

    // Report elapsed time (kernel.Run returns the execution time in milliseconds)
    Console.WriteLine("Elapsed time: {0}ms", elapsedTime);

    return 0;
}
internal long ParallelFor(long value)
{
    result.CopyToDevice(new long[] { 0 });
    parallelFor.Run(new object[] { value, result.DevicePointer });

    long[] returned = new long[1];
    result.CopyToHost(returned);
    return returned[0];
}
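// ParallelFor above assumes a pre-loaded kernel and a one-element result buffer. A hedged
// sketch of that setup; the PTX path, kernel name, and launch shape are assumptions.
CudaContext ctx = new CudaContext(0);
CudaKernel parallelFor;
CudaDeviceVariable<long> result;

void InitParallelFor()
{
    parallelFor = ctx.LoadKernel("parallelFor.ptx", "parallelFor");   // hypothetical module/entry name
    parallelFor.BlockDimensions = 256;                                // threads per block (assumed)
    parallelFor.GridDimensions = 64;                                  // blocks (assumed)
    result = new CudaDeviceVariable<long>(1);
}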
static void Main(string[] args)
{
    // NOTE: You need to change this location to match your own machine.
    Console.ForegroundColor = ConsoleColor.Red;
    Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment.");
    Console.ResetColor();
    System.Threading.Thread.Sleep(500);

    // Load the kernel.
    string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx";
    CudaContext ctx = new CudaContext();
    CUmodule module = ctx.LoadModule(path);
    kernel = new CudaKernel("kernel", module, ctx);

    // Launch enough threads to cover VECTOR_SIZE elements.
    kernel.BlockDimensions = THREADS_PER_BLOCK;
    kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1;

    // Create the topology.
    int[] topology = new int[] { 1, 200, 200, 100, 1 };
    int height = topology.Length;
    int width = 0;
    for (int i = 0; i < topology.Length; i++)
        if (width < topology[i])
            width = topology[i];

    // Launch!
    float[] res = new float[height * width];
    for (int i = 0; i < 10; i++)
    {
        float[] matrix = new float[height * width];
        float[] weights = new float[height * width];
        Random rand = new Random(424242);

        for (int y = 0; y < height; y++)
        {
            for (int x = 0; x < width; x++)
            {
                matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0;
                weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0;
            }
        }

        // Load the kernel with some variables.
        CudaDeviceVariable<int> cuda_topology = topology;
        CudaDeviceVariable<float> cuda_membank = matrix;
        CudaDeviceVariable<float> cuda_weights = weights;

        Stopwatch sw = new Stopwatch();
        sw.Start();
        kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width);
        cuda_membank.CopyToHost(res);
        sw.Stop();

        Console.ForegroundColor = ConsoleColor.Green;
        Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]);
        Console.ResetColor();
    }

    Console.ReadKey();
}
private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth)
{
    int count = width * height * depth;
    int widthD = width - 1;
    int heightD = height - 1;
    int depthD = depth - 1;
    int countDecremented = widthD * heightD * depthD;

    dim3 blockDimensions = new dim3(8, 8, 8);
    dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0));
    dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0));

    CUDANoiseCube noiseCube = new CUDANoiseCube();
    CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16);
    CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray);

    CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count);

    kernelPositionWeight.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions);
    kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth);

    kernelNormalAmbient.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions);
    kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount);

    int nearestW = NearestPowerOfTwo(widthD);
    int nearestH = NearestPowerOfTwo(heightD);
    int nearestD = NearestPowerOfTwo(depthD);
    int nearestCount = nearestW * nearestH * nearestD;

    CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount);
    trisCountDevice.Memset(0);
    CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented);

    kernelMarchingCubesCases.BlockDimensions = blockDimensions;
    typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented);
    kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD);

    CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount);

    int lastTrisCount = 0;
    trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int));

    int lastPrefixSum = 0;
    prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int));

    int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3;

    if (totalVerticesCount > 0)
    {
        if (container.Geometry != null)
            container.Geometry.Dispose();

        container.VertexCount = totalVerticesCount;

        container.Geometry = new Buffer(graphicsDevice, new BufferDescription()
        {
            BindFlags = BindFlags.VertexBuffer,
            CpuAccessFlags = CpuAccessFlags.None,
            OptionFlags = ResourceOptionFlags.None,
            SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount,
            Usage = ResourceUsage.Default
        });

        CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None);

        kernelMarchingCubesVertices.BlockDimensions = blockDimensions;
        typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented);

        directResource.Map();
        kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD);
        directResource.UnMap();

        directResource.Dispose();
    }
    else
    {
        container.VertexCount = 0;

        if (container.Geometry != null)
            container.Geometry.Dispose();
    }

    noiseCube.Dispose();
    prefixSumsDev.Dispose();
    trisCountDevice.Dispose();
    offsetsDev.Dispose();
    noiseArray.Dispose();
    noiseTexture.Dispose();
    voxelsDev.Dispose();
}
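// Generate() sets grid sizes through reflection on CudaKernel's private "_gridDim" field.
// The public GridDimensions setter used throughout the rest of this file should achieve the
// same thing; a hedged helper sketch, assuming the reflection has no other purpose:
static void Configure(CudaKernel kernel, dim3 block, dim3 grid)
{
    kernel.BlockDimensions = block;   // public setters instead of GetField("_gridDim").SetValue(...)
    kernel.GridDimensions = grid;
}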