private void initGLAndCuda() { //Create render target control m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default); m_renderControl.Dock = DockStyle.Fill; m_renderControl.BackColor = Color.White; m_renderControl.BorderStyle = BorderStyle.FixedSingle; m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown); m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove); m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown); m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged); panel1.Controls.Add(m_renderControl); Console.WriteLine(" OpenGL device is Available"); int deviceID = CudaContext.GetMaxGflopsDeviceId(); ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync); string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount); Console.WriteLine(console); CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx); hvfield = new cData[DS]; dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM); tPitch = dvfield.Pitch; dvfield.CopyToDevice(hvfield); vxfield = new CudaDeviceVariable<cData>(DS); vyfield = new CudaDeviceVariable<cData>(DS); // Create particle array particles = new cData[DS]; initParticles(particles, DIM, DIM); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding); planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding); GL.GenBuffers(1, out vbo); GL.BindBuffer(BufferTarget.ArrayBuffer, vbo); GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw); int bsize; GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize); if (bsize != DS * cData.SizeOf) throw new Exception("Sizes don't match."); GL.BindBuffer(BufferTarget.ArrayBuffer, 0); cuda_vbo_resource = new CudaGraphicsInteropResourceCollection(); cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None)); texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two); stopwatch = new CudaStopWatch(CUEventFlags.Default); reshape(); isInit = true; display(); }
public GpuMathOperations() { this.cuBlas = new CudaBlas(); this.cudaContext = new CudaContext(); this.cuModule = cudaContext.LoadModulePTX(kernalFile); this.maxThreadPerBlockDim = (int)Math.Sqrt(this.cudaContext.GetDeviceInfo().MaxThreadsPerBlock); }
private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue) { CudaContext cudaContext = context.CudaContextForTensor(originalLengths); cudaContext.SetCurrent(); int ndim = result.DimensionCount; long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides); long cols = result.Sizes[ndim - 1]; if (storageSize % cols != 0) { throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'"); } long rows = storageSize / cols; dim3 threads = new dim3((uint)Math.Min(512, rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y))); CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result); CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths); Invoke(context, cudaContext, "BuildSelfTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue); }
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f) { CudaContext cudaContext = context.CudaContextForTensor(src1); cudaContext.SetCurrent(); int ndim = src1.DimensionCount; long storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides); long cols = src1.Sizes[ndim - 1]; if (storageSize % cols != 0) { throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'"); } long rows = storageSize / cols; dim3 threads = new dim3((uint)Math.Min(512, rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y))); CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result); CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1); CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2); CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha); CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta); Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps); }
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f) { CudaContext cudaContext = context.CudaContextForTensor(inGrad); cudaContext.SetCurrent(); int ndim = inGrad.DimensionCount; long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides); long cols = inGrad.Sizes[ndim - 1]; if (storageSize % cols != 0) { throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'"); } long rows = storageSize / cols; dim3 threads = new dim3((uint)Math.Min(512, rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y))); CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad); CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad); CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad); CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad); CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad); CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y); CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1); CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2); CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha); CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta); Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps); }
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true) { CudaContext cudaContext = context.CudaContextForTensor(grad); cudaContext.SetCurrent(); int ndim = grad.DimensionCount; long storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides); long cols = grad.Sizes[ndim - 1]; if (storageSize % cols != 0) { throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'"); } long rows = storageSize / cols; int iAddGrad = addGrad ? 1 : 0; dim3 threads = new dim3((uint)Math.Min(512, rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y))); CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad); CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj); CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val); Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad); }
// Testing getting device information via managedCuda private static void GetInformationAboutDevice() { // Number of devices var deviceCount = CudaContext.GetDeviceCount(); Console.WriteLine(deviceCount + " Devices"); if (deviceCount <= 0) { throw new Exception("No cuda device detected"); } // Pick device based on performance. var deviceByFlops = CudaContext.GetMaxGflopsDeviceId(); Console.WriteLine("Unit {0} has the most Gflops", deviceByFlops); var deviceProperties = CudaContext.GetDeviceInfo(deviceByFlops); Console.WriteLine("And has the following properties: "); Console.WriteLine(deviceProperties.DeviceName); Console.WriteLine("Can execute concurrent kernels: " + deviceProperties.ConcurrentKernels); Console.WriteLine("Multi processor count: " + deviceProperties.MultiProcessorCount); Console.WriteLine("Clockrate (mhz): " + (int)deviceProperties.ClockRate / 1000.0); Console.WriteLine("Total global memory (MB): " + deviceProperties.TotalGlobalMemory / 1000000); Console.WriteLine("Is integrated: " + deviceProperties.Integrated); Console.WriteLine("Max block dimension: " + deviceProperties.MaxGridDim); Console.WriteLine("Max block dimension: " + deviceProperties.MaxBlockDim); Console.WriteLine("Max threads per block: " + deviceProperties.MaxThreadsPerBlock); Console.WriteLine("Max threads per multiprocessor: " + deviceProperties.MaxThreadsPerMultiProcessor); Console.WriteLine("Max shared mem block can use (b): " + deviceProperties.SharedMemoryPerBlock); Console.WriteLine("If device can do mem copy and kernel execution: " + deviceProperties.GpuOverlap); Console.WriteLine("can map memory adress space on host and device: " + deviceProperties.CanMapHostMemory); }
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f) { CudaContext cudaContext = context.CudaContextForTensor(inGrad); cudaContext.SetCurrent(); long rows = inGrad.Sizes[0]; long cols = inGrad.Sizes[1]; int ndim = inGrad.DimensionCount; long num_rows = 1; for (int dim = 0; dim < ndim - 1; dim++) { num_rows *= inGrad.Sizes[dim]; } dim3 threads = new dim3((uint)Math.Min(512, num_rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y))); CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad); CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad); CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad); CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad); CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad); CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y); CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1); CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2); CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha); CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta); Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps); }
//private CudaKernel kernel1; //public Class1() //{ // //int deviceID = 0; // //CudaContext ctx = new CudaContext(deviceID); // //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx"); // //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx); //} public static double[,] TestMatrix(double[][,] a) { using (CudaContext ctx = new CudaContext(0)) { CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx"); var kernel = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx); int dimZ = a.Length; int dimX = a[0].GetLength(0); int dimY = a[0].GetLength(1); kernel.GridDimensions = new dim3(28, 28, 1); kernel.BlockDimensions = new dim3(1, 1, 1); //kernel.BlockDimensions = new dim3(dimX, dimY, 1); // Allocate vectors in device memory and copy vectors from host memory to device memory CudaDeviceVariable <double> dA = a.ToLinearArray(); //CudaDeviceVariable<double> dB = ToLinearArray(b); CudaDeviceVariable <double> dC = new CudaDeviceVariable <double>(dimX * dimY); // Invoke kernel kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ); // Copy result from device memory to host memory double[] c = dC; //ctx.FreeMemory(dC.DevicePointer); //ctx.FreeMemory(dA.DevicePointer); //ctx.Dispose(); return(ToMultyArray(c, dimX)); } }
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f) { CudaContext cudaContext = context.CudaContextForTensor(src1); cudaContext.SetCurrent(); long rows = src1.Sizes[0]; long cols = src1.Sizes[1]; int ndim = src1.DimensionCount; long num_rows = 1; for (int dim = 0; dim < ndim - 1; dim++) { num_rows *= src1.Sizes[dim]; } dim3 threads = new dim3((uint)Math.Min(512, num_rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y))); CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result); CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1); CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2); CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha); CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta); Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps); }
private void Softmax(TSCudaContext context, Tensor result, Tensor src) { CudaContext cudaContext = context.CudaContextForTensor(src); cudaContext.SetCurrent(); long rows = src.Sizes[0]; long cols = src.Sizes[1]; int ndim = src.DimensionCount; long num_rows = 1; for (int dim = 0; dim < ndim - 1; dim++) { num_rows *= src.Sizes[dim]; } dim3 threads = new dim3((uint)Math.Min(512, num_rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y))); CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result); CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src); Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols); }
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps) { CudaContext cudaContext = context.CudaContextForTensor(weight); cudaContext.SetCurrent(); long rows = weight.Sizes[0]; long cols = weight.Sizes[1]; int ndim = weight.DimensionCount; long num_rows = 1; for (int dim = 0; dim < ndim - 1; dim++) { num_rows *= weight.Sizes[dim]; } dim3 threads = new dim3((uint)Math.Min(512, num_rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y))); CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight); CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient); CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache); Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps); }
public CUDAPrefixScan(CUmodule module, CudaContext context) { this.context = context; kernelScanExclusiveShared = new CudaKernel("scanExclusiveShared", module, context); kernelScanExclusiveShared2 = new CudaKernel("scanExclusiveShared2", module, context); kernelUniformUpdate = new CudaKernel("uniformUpdate", module, context); }
public static void blaa() { int num = 10; //NewContext creation CudaContext cntxt = new CudaContext(); //Module loading from precompiled .ptx in a project output folder CUmodule cumodule = cntxt.LoadModule("kernel.ptx"); //_Z9addKernelPf - function name, can be found in *.ptx file CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt); //Create device array for data CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num); //Create arrays with data float[] vec1 = new float[num]; //Copy data to device vec1_device.CopyToDevice(vec1); //Set grid and block dimensions addWithCuda.GridDimensions = new dim3(8, 1, 1); addWithCuda.BlockDimensions = new dim3(512, 1, 1); //Run the kernel addWithCuda.Run( vec1_device.DevicePointer); //Copy data from device vec1_device.CopyToHost(vec1); }
public PtrToMemory(CudaContext context, IDeviceMemoryPtr rootBlock, CUdeviceptr ptr, SizeT size) { _context = context; _ptr = new CudaDeviceVariable <float>(ptr, size); _rootBlock = rootBlock; rootBlock.AddRef(); }
public static float2[] calculateCudaFFT(float[] h_dataIn) { CudaContext cntxt = new CudaContext(); //Caution: Array sizes matter! Based on CUFFFT-Documentation... int size_real = h_dataIn.Length; int size_complex = (int)Math.Floor(size_real / 2.0) + 1; //Crating FFT Plan CudaFFTPlanMany fftPlan = new CudaFFTPlanMany(1, new int[] { size_real }, 1, cufftType.R2C); //Size of d_data must be padded for inplace R2C transforms: size_complex * 2 and not size_real CudaDeviceVariable <float> d_data = new CudaDeviceVariable <float>(size_complex * 2); //device allocation and host have different sizes, why the amount of data must be given explicitly for copying: d_data.CopyToDevice(h_dataIn, 0, 0, size_real * sizeof(float)); //executa plan fftPlan.Exec(d_data.DevicePointer, TransformDirection.Forward); //Output to host, either as float2 or float, but array sizes must be right! float2[] h_dataOut = new float2[size_complex]; float[] h_dataOut2 = new float[size_complex * 2]; d_data.CopyToHost(h_dataOut); d_data.CopyToHost(h_dataOut2); fftPlan.Dispose(); return(h_dataOut); }
public DenoiseAndDemoisaic(int tileSize, CudaContext ctx, CUmodule mod, bool UseCUDNN) { _tileSize = tileSize; start = new StartLayer(tileSize, tileSize, 3, 1); final = new FinalLayer(tileSize - 16, tileSize - 16, 3, 1, ctx, mod); if (UseCUDNN) { CudaDNNContext cuddn = new CudaDNNContext(); conv1 = new ConvolutionalLayer(tileSize, tileSize, 3, tileSize - 8, tileSize - 8, 64, 1, 9, 9, ConvolutionalLayer.Activation.PRelu, cuddn, ctx, mod); conv2 = new ConvolutionalLayer(tileSize - 8, tileSize - 8, 64, tileSize - 12, tileSize - 12, 64, 1, 5, 5, ConvolutionalLayer.Activation.PRelu, cuddn, ctx, mod); conv3 = new ConvolutionalLayer(tileSize - 12, tileSize - 12, 64, tileSize - 16, tileSize - 16, 3, 1, 5, 5, ConvolutionalLayer.Activation.None, cuddn, ctx, mod); start.ConnectFollowingLayer(conv1); conv1.ConnectFollowingLayer(conv2); conv2.ConnectFollowingLayer(conv3); conv3.ConnectFollowingLayer(final); } else { conv1NPP = new ConvolutionalLayerNPP(tileSize, tileSize, 3, tileSize - 8, tileSize - 8, 64, 1, 9, 9, ConvolutionalLayerNPP.Activation.PRelu, ctx, mod); conv2NPP = new ConvolutionalLayerNPP(tileSize - 8, tileSize - 8, 64, tileSize - 12, tileSize - 12, 64, 1, 5, 5, ConvolutionalLayerNPP.Activation.PRelu, ctx, mod); conv3NPP = new ConvolutionalLayerNPP(tileSize - 12, tileSize - 12, 64, tileSize - 16, tileSize - 16, 3, 1, 5, 5, ConvolutionalLayerNPP.Activation.None, ctx, mod); start.ConnectFollowingLayer(conv1NPP); conv1NPP.ConnectFollowingLayer(conv2NPP); conv2NPP.ConnectFollowingLayer(conv3NPP); conv3NPP.ConnectFollowingLayer(final); } tileAsPlanes = new CudaDeviceVariable <float>(tileSize * tileSize * 3); tile = new NPPImage_32fC3(tileSize, tileSize); }
public static void CalculateNeabours <T> (CudaContext context, DeviceDataSet <T> teaching, DeviceDataSet <T> test, CudaDeviceVariable <int> calculatedNeabours, int threadsPerBlock ) where T : struct { var kernel = context.LoadKernel("kernels/VectorReduction.ptx", "calculateNearestNeabours"); kernel.GridDimensions = test.length / threadsPerBlock + 1; kernel.BlockDimensions = threadsPerBlock; kernel.SetConstantVariable("testVectorsCount", test.length); kernel.SetConstantVariable("teachingVectorsCount", teaching.length); kernel.SetConstantVariable("attributeCount", teaching.attributeCount); using (var deviceDistanceMemory = new CudaDeviceVariable <float>(teaching.length * test.length)) { kernel.Run( teaching.vectors.DevicePointer, test.vectors.DevicePointer, deviceDistanceMemory.DevicePointer, calculatedNeabours.DevicePointer ); Thrust.sort_by_key_multiple(deviceDistanceMemory, calculatedNeabours, teaching.length, test.length); } }
public VectorReductionAccuracy(CudaContext context, DeviceDataSet <int> teaching, DeviceDataSet <int> test, int popSize) { this.teaching = teaching; this.test = test; this.popSize = popSize; this.context = context; calculatedNeabours = new CudaDeviceVariable <int>(teaching.length * test.length); deviceAccuracy = new CudaDeviceVariable <float>(popSize); Profiler.Start("calculate neabours"); Neabours.CalculateNeabours(context, teaching, test, calculatedNeabours, ThreadsPerBlock); Profiler.Stop("calculate neabours"); accuracyKernel = context.LoadKernel("kernels/VectorReduction.ptx", "calculateAccuracy"); dim3 gridDimension = new dim3() { x = (uint)(test.length / ThreadsPerBlock + 1), y = (uint)popSize, z = 1 }; accuracyKernel.GridDimensions = gridDimension; accuracyKernel.BlockDimensions = ThreadsPerBlock; accuracyKernel.SetConstantVariable("testVectorsCount", test.length); accuracyKernel.SetConstantVariable("teachingVectorsCount", teaching.length); accuracyKernel.SetConstantVariable("attributeCount", teaching.attributeCount); accuracyKernel.SetConstantVariable("genLength", teaching.length); K = 3; CountToPass = 2; }
private static dim3 GetContigReduceBlock(CudaContext cudaContext, long numSlices, long reductionSize) { // If the number of slices is low but the reduction dimension size // is high, then we should increase block size for greater parallelism. // Aim for at least 32 warps per SM (assume 15 SMs; don't bother // inquiring the real number for now). var smCount = 15; var maxWarps = 4; // better occupancy if many blocks are around // For numSlices > smCount * 8, there are > 32 warps active per SM. if (numSlices < smCount * 8) { maxWarps = 8; if (numSlices < smCount * 4) { maxWarps = 16; if (numSlices < smCount * 2) { maxWarps = 32; } } } // Scale up block size based on the reduction dimension size var warpsInReductionSize = ApplyUtils.CeilDiv(reductionSize, 32); var numWarps = warpsInReductionSize > maxWarps ? maxWarps : (int)warpsInReductionSize; var targetSize = numWarps * 32; targetSize = Math.Min(targetSize, (int)cudaContext.GetDeviceInfo().MaxBlockDim.x); return(new dim3(targetSize)); }
private int findGraphicsGPU(out string devName) { int nGraphicsGPU = 0; int deviceCount = 0; bool bFoundGraphics = false; string firstGraphicsName = string.Empty, temp; devName = string.Empty; deviceCount = CudaContext.GetDeviceCount(); // This function call returns 0 if there are no CUDA capable devices. if (deviceCount == 0) { Console.WriteLine("There are no device(s) supporting CUDA"); return(0); } else { Console.WriteLine("> Found " + deviceCount + " CUDA Capable Device(s)"); } for (int dev = 0; dev < deviceCount; dev++) { temp = CudaContext.GetDeviceName(dev); bool bGraphics = !temp.Contains("Tesla"); StringBuilder sb = new StringBuilder(); sb.Append("> "); if (bGraphics) { sb.Append("Graphics"); } else { sb.Append("Compute"); } sb.Append("\t\tGPU ").Append(dev).Append(": ").Append(CudaContext.GetDeviceName(dev)); Console.WriteLine(sb.ToString()); if (bGraphics) { if (!bFoundGraphics) { firstGraphicsName = temp; } nGraphicsGPU++; } } if (nGraphicsGPU != 0) { devName = firstGraphicsName; } else { devName = "this hardware"; } return(nGraphicsGPU); }
private void ReduceIndexOuterDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, int dimension, Tuple <float, float> init, string baseKernelName) { CudaContext cudaContext = context.CudaContextForTensor(src); int ndim = src.DimensionCount; long num_orows = 1; for (int dim = 0; dim < dimension; dim++) { num_orows *= src.Sizes[dim]; } long row_size = src.Sizes[dimension]; long num_irows = 1; for (int dim = dimension + 1; dim < ndim; dim++) { num_irows *= src.Sizes[dim]; } dim3 threads = new dim3((uint)Math.Min(512, num_irows)); int maxGridDim = 1024; dim3 grid = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x))); CUdeviceptr resultValPtr = CudaHelpers.GetBufferStart(resultValues); CUdeviceptr resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices); CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src); string kernelName = "outer_index_" + baseKernelName; Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultValPtr, resultIdxPtr, srcPtr, num_orows, num_irows, row_size, init.Item1, init.Item2); }
/// <summary> /// Initializes a new instance of the <see cref="TSCudaContext"/> class. /// </summary> public TSCudaContext() { try { this.deviceCount = CudaContext.GetDeviceCount(); } catch { // CudaContext.GetDeviceCount() throws if CUDA drivers are not installed this.deviceCount = 0; } this.devices = Enumerable.Repeat(0, deviceCount) .Select(x => new DeviceState(x)) .ToArray(); if (deviceCount > 0) { p2pAccess = EnablePeerAccess(devices.Select(x => x.CudaContext).ToArray(), devices[0].CudaContext); } else { p2pAccess = new bool[0, 0]; } this.diskCache = new RuntimeCompiler.KernelDiskCache(Path.Combine(Environment.CurrentDirectory, CacheDir)); this.compiler = new RuntimeCompiler.CudaCompiler(diskCache); OpRegistry.RegisterAssembly(Assembly.GetExecutingAssembly()); }
public CudaKernel Get(CudaContext context, byte[] ptx, string kernelName) { lock (locker) { try { if (activeKernels.TryGetValue(Tuple.Create(context, ptx, kernelName), out CudaKernel value)) { return(value); } else { value = context.LoadKernelPTX(ptx, kernelName); activeKernels.Add(Tuple.Create(context, ptx, kernelName), value); return(value); } } catch (Exception err) { Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Exception: '{err.Message}'"); Logger.WriteLine(Logger.Level.err, ConsoleColor.Red, $"Call stack: '{err.StackTrace}'"); throw err; } } }
static void InitKernels() { cntxt = new CudaContext(); CUmodule cumodule = cntxt.LoadModulePTX(@"C:\work\Sobel\CudaTest\x64\Debug\kernel.ptx"); matrixSumCude = new CudaKernel("_Z15matrixSumKernelPdPKdiii", cumodule, cntxt); }
// Testing managed CUDA call private static void RunCudaWithAKernel() { // C# Cuda code to call kernel int N = 50000; int deviceID = 0; CudaContext ctx = new CudaContext(deviceID); CudaKernel kernel = ctx.LoadKernel("kernel_x64.ptx", "VecAdd"); int numOfThreads = 256; kernel.GridDimensions = (N + numOfThreads - 1) / numOfThreads; kernel.BlockDimensions = numOfThreads; // allocate memory in host (not gpu) var h_A = InitWithData(N, numOfThreads * 4); var h_B = InitWithData(N, numOfThreads); // Allocate vectors in device memory and copy from host to device. CudaDeviceVariable <float> d_A = h_A; CudaDeviceVariable <float> d_B = h_B; CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(N); //Invoke kernel kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N); Console.WriteLine("kernel has runeth"); //Copy from memory of device to host. float[] h_C = d_C; }
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args) { ThrowIfAnyTensorInvalid(args); cudaContext.SetCurrent(); CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext); IEnumerable <Tensor> allTensors = args.OfType <Tensor>(); Tensor firstTensor = allTensors.First(); long elementCount = firstTensor.ElementCount(); ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray()); ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args); ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock(); ManagedCuda.VectorTypes.dim3 grid = ApplyUtils.GetApplyGrid(deviceInfo, elementCount); string fullKernelName = PermutationGenerator.GetMangledName(baseName, spec); CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, fullKernelName); kernel.GridDimensions = grid; kernel.BlockDimensions = block; kernel.RunAsync(CUstream.NullStream, args); }
public void GenerateRandomNumbers() { CleanupResources(); //Init Cuda context ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); // Allocate input vectors h_A and h_B in host memory h_A = new float[Count]; h_B = new float[Count]; // Initialize input vectors RandomInit(h_A, Count); RandomInit(h_B, Count); // Allocate vectors in device memory and copy vectors from host memory to device memory // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation. d_A = h_A; d_B = h_B; //d_C = new CudaDeviceVariable<float>(Count); // Allocate Shared Memory. The GPU will write here // A = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global); // B = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global); C = new CudaManagedMemory_float(Count, CUmemAttach_flags.Global); }
/// <summary> /// Invokes the specified kernels. /// </summary> /// <param name="kernels">The kernels.</param> /// <param name="context">The context.</param> /// <param name="cudaContext">The cuda context.</param> /// <param name="result">The result.</param> /// <param name="src">The source.</param> public static void Invoke(FillCopyKernels kernels, TSCudaContext context, CudaContext cudaContext, NDArray result, NDArray src) { var ptx = kernels.GetPtx(context.Compiler); var elementCount = result.ElementCount(); ApplyOpInvoke.Invoke(context, cudaContext, ptx, "copy", result, src, elementCount); }
// General GPU Device CUDA Initialization static int gpuDeviceInit(int devID) { int deviceCount = CudaContext.GetDeviceCount(); if (deviceCount == 0) { Console.Write("gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); Environment.Exit(-1); } if (devID < 0) { devID = 0; } if (devID > deviceCount - 1) { Console.Write("\n"); Console.Write(">> {0} CUDA capable GPU device(s) detected. <<\n", deviceCount); Console.Write(">> gpuDeviceInit (-device={0}) is not a valid GPU device. <<\n", devID); Console.Write("\n"); return(-devID); } if (CudaContext.GetDeviceComputeCapability(devID).Major < 1) { Console.Write("gpuDeviceInit(): GPU device does not support CUDA.\n"); Environment.Exit(-1); } ctx = new CudaContext(devID); Console.Write("> gpuDeviceInit() CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName()); return(devID); }
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true) { CudaContext cudaContext = context.CudaContextForTensor(grad); cudaContext.SetCurrent(); long rows = grad.Sizes[0]; long cols = grad.Sizes[1]; int iAddGrad = addGrad ? 1 : 0; int ndim = grad.DimensionCount; long num_rows = 1; for (int dim = 0; dim < ndim - 1; dim++) { num_rows *= grad.Sizes[dim]; } dim3 threads = new dim3((uint)Math.Min(512, num_rows)); dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y))); CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad); CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj); CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val); Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad); }
public static void InitKernels() { CudaContext cntxt = new CudaContext(); //CUmodule cumodule = cntxt.LoadModule(@"C:\Users\Michał\Documents\Visual Studio 2013\Projects\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx"); CUmodule cumodule = cntxt.LoadModule(@"D:\Grafika\cuda\Projekt cuda\Projekt cuda\Debug\kernel.ptx"); addWithCuda = new CudaKernel("_Z6kerneliiPi", cumodule, cntxt); }
public FxCuda(Boolean initUtils=false) { //Init Cuda context ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); // init utils if (initUtils) Utils = new CudaUtils(this); }
public CudaModuleHelper(CudaContext context, string file) { Context = context; Module = context.LoadModule(file); PtxFile = file; functionNames = File.ReadAllLines(file) .Where(x => x.Contains("// .globl")) .Select(x => x.Replace("// .globl", "").Trim()) .ToArray(); }
static void InitKernels() { //max thread number - 65534x256=16776704 _matrixSize = 256; _threadsPerBlock = 256; CleanUpResources(); _cnContext = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); CUmodule cumodule = _cnContext.LoadModule(@"\Kernel\kernel.ptx"); _multiplyTwoVectorWithCuda = new CudaKernel("_Z6kernel_", cumodule, _cnContext); }
public void ContextWithStream() { using ( var cuda = new CudaContext() ) using ( var stream = new CudaStream(CUStreamFlags.Default) ) { using (var context = CudnnContext.Create(stream)) { Assert.True(context.IsInitialized); var streamId = default (CUstream); CudnnContext.Invoke(() => CudnnNativeMethods.cudnnGetStream(context.Handle, out streamId)); Assert.Equal(stream.Stream, streamId); } } }
public uint[] Run() { var ptx = @"C:\Src\_Tree\SmallPrograms\Buddhabrot\Buddhabrot.Cuda70\x64\Release\Buddhabrot.ptx"; var context = new CudaContext(); var module = new CudaModuleHelper(context, ptx); var init = module.GetKernel("Init"); var setSettings = module.GetKernel("SetSettings"); var runBuddha = module.GetKernel("RunBuddha"); var nBlocks = 4196; var nThreads = 256; var dSettings = context.AllocateMemoryFor(settings); context.CopyToDevice(dSettings, settings); var array = new uint[settings.Width * settings.Height]; var dState = context.AllocateMemory(nThreads * nBlocks * SizeOfCurandState); var dArray = context.AllocateMemoryFor(array); context.CopyToDevice(dArray, array); init.Launch(nBlocks, nThreads, dState); setSettings.Launch(1, 1, dSettings); Console.WriteLine("Starting..."); var sw = Stopwatch.StartNew(); long i = 0; while (!IsStopping) { runBuddha.Launch(nBlocks, nThreads, dArray, dState); double count = (++i * nBlocks * nThreads); if (i % 5 == 0) { Console.WriteLine("Generated {0:0.0} Million samples in {1:0.000} sec", count / 1000000.0, sw.ElapsedMilliseconds / 1000.0); } if (maxSamples.HasValue && count >= maxSamples) break; } context.CopyToHost(array, dArray); return array; }
static void Test(byte[] ptxFile) { const int size = 16; var context = new CudaContext(); var kernel = context.LoadKernelPTX(ptxFile, "kernel"); var memory = context.AllocateMemory(4 * size); var gpuMemory = new CudaDeviceVariable<int>(memory); var cpuMemory = new int[size]; for (var i = 0; i < size; i++) cpuMemory[i] = i - 2; gpuMemory.CopyToDevice(cpuMemory); kernel.BlockDimensions = 4; kernel.GridDimensions = 4; kernel.Run(memory); gpuMemory.CopyToHost(cpuMemory); for (var i = 0; i < size; i++) Console.WriteLine("{0} = {1}", i, cpuMemory[i]); }
private void InitializeCUDA() { context = new CudaContext(CudaContext.GetMaxGflopsDevice(), graphicsDevice.ComPointer, CUCtxFlags.SchedAuto, CudaContext.DirectXVersion.D3D11); module = context.LoadModulePTX(@"Kernels\kernel.ptx"); kernelPositionWeightNoiseCube = new CudaKernel("position_weight_noise_cube", module, context); kernelNormalAmbient = new CudaKernel("normal_ambient", module, context); kernelMarchingCubesCases = new CudaKernel("marching_cubes_cases", module, context); kernelMarchingCubesVertices = new CudaKernel("marching_cubes_vertices", module, context); kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context); kernelPositionWeightFormula = new CudaKernel("position_weight_formula", module, context); prefixScan = new CUDAPrefixScan(module, context); }
// General GPU Device CUDA Initialization static int gpuDeviceInit(int devID) { int deviceCount = CudaContext.GetDeviceCount(); if (deviceCount == 0) { Console.Write("gpuDeviceInit() CUDA error: no devices supporting CUDA.\n"); Environment.Exit(-1); } if (devID < 0) devID = 0; if (devID > deviceCount - 1) { Console.Write("\n"); Console.Write(">> {0} CUDA capable GPU device(s) detected. <<\n", deviceCount); Console.Write(">> gpuDeviceInit (-device={0}) is not a valid GPU device. <<\n", devID); Console.Write("\n"); return -devID; } if (CudaContext.GetDeviceComputeCapability(devID).Major < 1) { Console.Write("gpuDeviceInit(): GPU device does not support CUDA.\n"); Environment.Exit(-1); } ctx = new CudaContext(devID); Console.Write("> gpuDeviceInit() CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName()); return devID; }
static void Main(string[] args) { int cuda_device = 0; int nstreams = 4; // number of streams for CUDA calls int nreps = 10; // number of times each experiment is repeated int n = 16 * 1024 * 1024; // number of ints in the data set int nbytes = n * sizeof(int); // number of data bytes dim3 threads, blocks; // kernel launch configuration float elapsed_time, time_memcpy, time_kernel; // timing variables float scale_factor = 1.0f; // allocate generic memory and pin it laster instead of using cudaHostAlloc() // Untested in C#, so stick to cudaHostAlloc(). bool bPinGenericMemory = false; // we want this to be the default behavior CUCtxFlags device_sync_method = CUCtxFlags.BlockingSync; // by default we use BlockingSync int niterations; // number of iterations for the loop inside the kernel ShrQATest.shrQAStart(args); Console.WriteLine("[ simpleStreams ]"); foreach (var item in args) { if (item.Contains("help")) { printHelp(); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_PASSED); } } bPinGenericMemory = false; foreach (var item in args) { if (item.Contains("use_generic_memory")) { bPinGenericMemory = true; } } for (int i = 0; i < args.Length; i++) { if (args[i].Contains("sync_method")) { int temp = -1; bool error = false; if (i < args.Length - 1) { error = int.TryParse(args[i + 1], out temp); switch (temp) { case 0: device_sync_method = CUCtxFlags.SchedAuto; break; case 1: device_sync_method = CUCtxFlags.SchedSpin; break; case 2: device_sync_method = CUCtxFlags.SchedYield; break; case 4: device_sync_method = CUCtxFlags.BlockingSync; break; default: error = true; break; } } if (!error) { Console.Write("Specifying device_sync_method = {0}, setting reps to 100 to demonstrate steady state\n", sDeviceSyncMethod[(int)device_sync_method]); nreps = 100; } else { Console.Write("Invalid command line option sync_method=\"{0}\"\n", temp); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); } } } int num_devices = CudaContext.GetDeviceCount(); if(0==num_devices) { Console.Write("your system does not have a CUDA capable device, waiving test...\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); } cuda_device = CudaContext.GetMaxGflopsDeviceId(); CudaDeviceProperties deviceProp = CudaContext.GetDeviceInfo(cuda_device); if ((1 == deviceProp.ComputeCapability.Major) && (deviceProp.ComputeCapability.Minor < 1)) { Console.Write("{0} does not have Compute Capability 1.1 or newer. Reducing workload.\n", deviceProp.DeviceName); } if (deviceProp.ComputeCapability.Major >= 2) { niterations = 100; } else { if (deviceProp.ComputeCapability.Minor > 1) { niterations = 5; } else { niterations = 1; // reduced workload for compute capability 1.0 and 1.1 } } // Check if GPU can map host memory (Generic Method), if not then we override bPinGenericMemory to be false // In .net we cannot allocate easily generic aligned memory, so <bPinGenericMemory> is always false in our case... if (bPinGenericMemory) { Console.Write("Device: <{0}> canMapHostMemory: {1}\n", deviceProp.DeviceName, deviceProp.CanMapHostMemory ? "Yes" : "No"); if (deviceProp.CanMapHostMemory == false) { Console.Write("Using cudaMallocHost, CUDA device does not support mapping of generic host memory\n"); bPinGenericMemory = false; } } // Anything that is less than 32 Cores will have scaled down workload scale_factor = Math.Max((32.0f / (ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * (float)deviceProp.MultiProcessorCount)), 1.0f); n = (int)Math.Round((float)n / scale_factor); Console.Write("> CUDA Capable: SM {0}.{1} hardware\n", deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor); Console.Write("> {0} Multiprocessor(s) x {1} (Cores/Multiprocessor) = {2} (Cores)\n", deviceProp.MultiProcessorCount, ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor), ConvertSMVer2Cores(deviceProp.ComputeCapability.Major, deviceProp.ComputeCapability.Minor) * deviceProp.MultiProcessorCount); Console.Write("> scale_factor = {0:0.0000}\n", 1.0f / scale_factor); Console.Write("> array_size = {0}\n\n", n); // enable use of blocking sync, to reduce CPU usage Console.Write("> Using CPU/GPU Device Synchronization method ({0})\n", sDeviceSyncMethod[(int)device_sync_method]); CudaContext ctx; if (bPinGenericMemory) ctx = new CudaContext(cuda_device, device_sync_method | CUCtxFlags.MapHost); else ctx = new CudaContext(cuda_device, device_sync_method); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) resName = "simpleStreams_x64.ptx"; else resName = "simpleStreams.ptx"; string resNamespace = "simpleStreams"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) throw new ArgumentException("Kernel not found in resources."); CudaKernel init_array = ctx.LoadKernelPTX(stream, "init_array"); // allocate host memory int c = 5; // value to which the array will be initialized int[] h_a = null; // pointer to the array data in host memory CudaPageLockedHostMemory<int> hAligned_a = null; // pointer to the array data in host memory (aligned to MEMORY_ALIGNMENT) //Note: In .net we have two seperated arrays: One is in managed memory (h_a), the other one in unmanaged memory (hAligned_a). //In C++ hAligned_a would point somewhere inside the h_a array. AllocateHostMemory(bPinGenericMemory, ref h_a, ref hAligned_a, nbytes); Console.Write("\nStarting Test\n"); // allocate device memory CudaDeviceVariable<int> d_c = c; //using new implicit cast to allocate memory and asign value CudaDeviceVariable<int> d_a = new CudaDeviceVariable<int>(nbytes / sizeof(int)); CudaStream[] streams = new CudaStream[nstreams]; for (int i = 0; i < nstreams; i++) { streams[i] = new CudaStream(); } // create CUDA event handles // use blocking sync CudaEvent start_event, stop_event; CUEventFlags eventflags = ((device_sync_method == CUCtxFlags.BlockingSync) ? CUEventFlags.BlockingSync : CUEventFlags.Default); start_event = new CudaEvent(eventflags); stop_event = new CudaEvent(eventflags); // time memcopy from device start_event.Record(); // record in stream-0, to ensure that all previous CUDA calls have completed hAligned_a.AsyncCopyToDevice(d_a, streams[0].Stream); stop_event.Record(); stop_event.Synchronize(); // block until the event is actually recorded time_memcpy = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("memcopy:\t{0:0.00}\n", time_memcpy); // time kernel threads = new dim3(512, 1); blocks = new dim3(n / (int)threads.x, 1); start_event.Record(); init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; init_array.RunAsync(streams[0].Stream, d_a.DevicePointer, d_c.DevicePointer, niterations); stop_event.Record(); stop_event.Synchronize(); time_kernel = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("kernel:\t\t{0:0.00}\n", time_kernel); ////////////////////////////////////////////////////////////////////// // time non-streamed execution for reference threads = new dim3(512, 1); blocks = new dim3(n / (int)threads.x, 1); start_event.Record(); for(int k = 0; k < nreps; k++) { init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; init_array.Run(d_a.DevicePointer, d_c.DevicePointer, niterations); hAligned_a.SynchronCopyToHost(d_a); } stop_event.Record(); stop_event.Synchronize(); elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("non-streamed:\t{0:0.00} ({1:00} expected)\n", elapsed_time / nreps, time_kernel + time_memcpy); ////////////////////////////////////////////////////////////////////// // time execution with nstreams streams threads = new dim3(512, 1); blocks = new dim3(n / (int)(nstreams * threads.x), 1); byte[] memset = new byte[nbytes]; // set host memory bits to all 1s, for testing correctness for (int i = 0; i < nbytes; i++) { memset[i] = 255; } System.Runtime.InteropServices.Marshal.Copy(memset, 0, hAligned_a.PinnedHostPointer, nbytes); d_a.Memset(0); // set device memory to all 0s, for testing correctness start_event.Record(); for(int k = 0; k < nreps; k++) { init_array.BlockDimensions = threads; init_array.GridDimensions = blocks; // asynchronously launch nstreams kernels, each operating on its own portion of data for(int i = 0; i < nstreams; i++) init_array.RunAsync(streams[i].Stream, d_a.DevicePointer + i * n / nstreams * sizeof(int), d_c.DevicePointer, niterations); // asynchronously launch nstreams memcopies. Note that memcopy in stream x will only // commence executing when all previous CUDA calls in stream x have completed for (int i = 0; i < nstreams; i++) hAligned_a.AsyncCopyFromDevice(d_a, i * n / nstreams * sizeof(int), i * n / nstreams * sizeof(int), nbytes / nstreams, streams[i].Stream); } stop_event.Record(); stop_event.Synchronize(); elapsed_time = CudaEvent.ElapsedTime(start_event, stop_event); Console.Write("{0} streams:\t{1:0.00} ({2:0.00} expected with compute capability 1.1 or later)\n", nstreams, elapsed_time / nreps, time_kernel + time_memcpy / nstreams); // check whether the output is correct Console.Write("-------------------------------\n"); //We can directly access data in hAligned_a using the [] operator, but copying //data first to h_a is faster. System.Runtime.InteropServices.Marshal.Copy(hAligned_a.PinnedHostPointer, h_a, 0, nbytes / sizeof(int)); bool bResults = correct_data(h_a, n, c*nreps*niterations); // release resources for(int i = 0; i < nstreams; i++) { streams[i].Dispose(); } start_event.Dispose(); stop_event.Dispose(); hAligned_a.Dispose(); d_a.Dispose(); d_c.Dispose(); CudaContext.ProfilerStop(); ctx.Dispose(); ShrQATest.shrQAFinishExit(args, bResults ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
private void InitializeD3D() { // Create the D3D object. d3d = new Direct3DEx(); PresentParameters pp = new PresentParameters(); pp.BackBufferWidth = 512; pp.BackBufferHeight = 512; pp.BackBufferFormat = Format.Unknown; pp.BackBufferCount = 0; pp.Multisample = MultisampleType.None; pp.MultisampleQuality = 0; pp.SwapEffect = SwapEffect.Discard; pp.DeviceWindowHandle = panel1.Handle; pp.Windowed = true; pp.EnableAutoDepthStencil = false; pp.AutoDepthStencilFormat = Format.Unknown; pp.PresentationInterval = PresentInterval.Default; bDeviceFound = false; CUdevice[] cudaDevices = null; for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++) { device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp); try { cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9); bDeviceFound = cudaDevices.Length > 0; Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA."); break; } catch (CudaException) { //No Cuda device found for this Direct3D9 device Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA."); } } // we check to make sure we have found a cuda-compatible D3D device to work on if (!bDeviceFound) { Console.WriteLine("No CUDA-compatible Direct3D9 device available"); if (device != null) device.Dispose(); Close(); return; } ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9); // Set projection matrix SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1); device.SetTransform(TransformState.Projection, matProj); // Turn off D3D lighting, since we are providing our own vertex colors device.SetRenderState(RenderState.Lighting, false); //Load kernels CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_k", module, ctx); }
public static void Initialize(CudaContext context, Device device) { _context = context; _device = device; }
static void Main(string[] args) { int SIGNAL_SIZE = 50; int FILTER_KERNEL_SIZE = 11; Console.WriteLine("[simpleCUFFT] is starting..."); var assembly = Assembly.GetExecutingAssembly(); var resourceName = "simpleCUFFT.simpleCUFFTKernel.ptx"; CudaContext ctx = new CudaContext(0); CudaKernel ComplexPointwiseMulAndScale; string[] liste = assembly.GetManifestResourceNames(); using (Stream stream = assembly.GetManifestResourceStream(resourceName)) { ComplexPointwiseMulAndScale = ctx.LoadKernelPTX(stream, "ComplexPointwiseMulAndScale"); } // Allocate host memory for the signal cuFloatComplex[] h_signal = new cuFloatComplex[SIGNAL_SIZE]; //we use cuFloatComplex for complex multiplaction in reference host code... Random rand = new Random(0); // Initialize the memory for the signal for (int i = 0; i < SIGNAL_SIZE; ++i) { h_signal[i].real = (float)rand.NextDouble(); h_signal[i].imag = 0; } // Allocate host memory for the filter cuFloatComplex[] h_filter_kernel = new cuFloatComplex[FILTER_KERNEL_SIZE]; // Initialize the memory for the filter for (int i = 0; i < FILTER_KERNEL_SIZE; ++i) { h_filter_kernel[i].real = (float)rand.NextDouble(); h_filter_kernel[i].imag = 0; } // Pad signal and filter kernel cuFloatComplex[] h_padded_signal = null; cuFloatComplex[] h_padded_filter_kernel = null; int new_size = PadData(h_signal, ref h_padded_signal, SIGNAL_SIZE, h_filter_kernel, ref h_padded_filter_kernel, FILTER_KERNEL_SIZE); int mem_size = (int)cuFloatComplex.SizeOf * new_size; // Allocate device memory for signal CudaDeviceVariable<cuFloatComplex> d_signal = new CudaDeviceVariable<cuFloatComplex>(new_size); // Copy host memory to device d_signal.CopyToDevice(h_padded_signal); // Allocate device memory for filter kernel CudaDeviceVariable<cuFloatComplex> d_filter_kernel = new CudaDeviceVariable<cuFloatComplex>(new_size); // Copy host memory to device d_filter_kernel.CopyToDevice(h_padded_filter_kernel); // CUFFT plan simple API CudaFFTPlan1D plan = new CudaFFTPlan1D(new_size, cufftType.C2C, 1); // Transform signal and kernel Console.WriteLine("Transforming signal cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Forward); plan.Exec(d_filter_kernel.DevicePointer, TransformDirection.Forward); // Multiply the coefficients together and normalize the result Console.WriteLine("Launching ComplexPointwiseMulAndScale<<< >>>"); ComplexPointwiseMulAndScale.BlockDimensions = 256; ComplexPointwiseMulAndScale.GridDimensions = 32; ComplexPointwiseMulAndScale.Run(d_signal.DevicePointer, d_filter_kernel.DevicePointer, new_size, 1.0f / new_size); // Transform signal back Console.WriteLine("Transforming signal back cufftExecC2C"); plan.Exec(d_signal.DevicePointer, TransformDirection.Inverse); // Copy device memory to host cuFloatComplex[] h_convolved_signal = d_signal; // Allocate host memory for the convolution result cuFloatComplex[] h_convolved_signal_ref = new cuFloatComplex[SIGNAL_SIZE]; // Convolve on the host Convolve(h_signal, SIGNAL_SIZE, h_filter_kernel, FILTER_KERNEL_SIZE, h_convolved_signal_ref); // check result bool bTestResult = sdkCompareL2fe(h_convolved_signal_ref, h_convolved_signal, 1e-5f); //Destroy CUFFT context plan.Dispose(); // cleanup memory d_filter_kernel.Dispose(); d_signal.Dispose(); ctx.Dispose(); if (bTestResult) { Console.WriteLine("Test Passed"); } else { Console.WriteLine("Test Failed"); } }
static void Main(string[] args) { var assembly = Assembly.GetExecutingAssembly(); var resourceName = "simpleOccupancy.simpleOccupancy.ptx"; ctx = new CudaContext(0); string[] liste = assembly.GetManifestResourceNames(); using (Stream stream = assembly.GetManifestResourceStream(resourceName)) { kernel = ctx.LoadKernelPTX(stream, "square"); } Console.WriteLine("starting Simple Occupancy"); Console.WriteLine(); Console.WriteLine("[ Manual configuration with {0} threads per block ]", manualBlockSize); int status = test(false); if (status != 0) { Console.WriteLine("Test failed"); return; } Console.WriteLine(); Console.WriteLine("[ Automatic, occupancy-based configuration ]"); status = test(true); if (status != 0) { Console.WriteLine("Test failed"); return; } Console.WriteLine(); Console.WriteLine("Test PASSED"); }
protected void InitContext() { var size = ParticlesCount * DimensionsCount; var threadsNum = 32; var blocksNum = ParticlesCount / threadsNum; Ctx = new CudaContext(0); UpdateVelocity = Ctx.LoadKernel("update_velocity_kernel.ptx", "updateVelocityKernel"); UpdateVelocity.GridDimensions = blocksNum; UpdateVelocity.BlockDimensions = threadsNum; Transpose = Ctx.LoadKernel(KernelFile, "transposeKernel"); Transpose.GridDimensions = blocksNum; Transpose.BlockDimensions = threadsNum; HostPositions = Random.RandomVector(size, -5.0, 5.0); HostVelocities = Random.RandomVector(size, -2.0, 2.0); HostPersonalBests = (double[]) HostPositions.Clone(); HostPersonalBestValues = Enumerable.Repeat(double.MaxValue,ParticlesCount).ToArray(); HostNeighbors = new int[ParticlesCount * 2]; for (var i = 0; i < ParticlesCount*2; i += 2) { int left, right; if (i == 0) left = ParticlesCount - 1; else left = i - 1; if (i == ParticlesCount - 1) right = 0; else right = i + 1; HostNeighbors[i] = left; HostNeighbors[i + 1] = right; } DevicePositions = HostPositions; DeviceVelocities = HostVelocities; DevicePersonalBests = HostPersonalBests; DevicePersonalBestValues = HostPersonalBestValues; DeviceNeighbors = HostNeighbors; Init(); }
protected void SetupCuda() { // Try to bind a CUDA context to the graphics card that WPF is working with. Adapter d3dAdapter = Device.Factory.GetAdapter(0); CUdevice[] cudaDevices = null; try { // Build a CUDA context from the first adapter in the used D3D11 device. cudaDevices = CudaContext.GetDirectXDevices(Device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D11); Debug.Assert(cudaDevices.Length > 0); Console.WriteLine("> Display Device #" + d3dAdapter + ": \"" + d3dAdapter.Description + "\" supports Direct3D11 and CUDA.\n"); } catch (CudaException) { // No Cuda device found for this Direct3D11 device. Console.Write("> Display Device #" + d3dAdapter + ": \"" + d3dAdapter.Description + "\" supports Direct3D11 but not CUDA.\n"); } ContextCuda = new CudaContext(cudaDevices[0], Device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D11); var info = ContextCuda.GetDeviceInfo(); Console.WriteLine("Max. Nr. Threads: " + info.MaxBlockDim + ", Total: " + info.MaxThreadsPerBlock + "\nMax. Nr. Blocks: " + info.MaxGridDim + "\nMax. Bytes Shared Per Block: " + info.SharedMemoryPerBlock); }
static void Main(string[] args) { string filename = "vectorAdd_kernel.cu"; //we assume the file is in the same folder... string fileToCompile = File.ReadAllText(filename); CudaRuntimeCompiler rtc = new CudaRuntimeCompiler(fileToCompile, "vectorAdd_kernel"); rtc.Compile(args); string log = rtc.GetLogAsString(); Console.WriteLine(log); byte[] ptx = rtc.GetPTX(); rtc.Dispose(); CudaContext ctx = new CudaContext(0); CudaKernel vectorAdd = ctx.LoadKernelPTX(ptx, "vectorAdd"); // Print the vector length to be used, and compute its size int numElements = 50000; SizeT size = numElements * sizeof(float); Console.WriteLine("[Vector addition of {0} elements]", numElements); // Allocate the host input vector A float[] h_A = new float[numElements]; // Allocate the host input vector B float[] h_B = new float[numElements]; // Allocate the host output vector C float[] h_C = new float[numElements]; Random rand = new Random(0); // Initialize the host input vectors for (int i = 0; i < numElements; ++i) { h_A[i] = (float)rand.NextDouble(); h_B[i] = (float)rand.NextDouble(); } Console.WriteLine("Allocate and copy input data from the host memory to the CUDA device\n"); // Allocate the device input vector A and copy to device CudaDeviceVariable<float> d_A = h_A; // Allocate the device input vector B and copy to device CudaDeviceVariable<float> d_B = h_B; // Allocate the device output vector C CudaDeviceVariable<float> d_C = new CudaDeviceVariable<float>(numElements); // Launch the Vector Add CUDA Kernel int threadsPerBlock = 256; int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock; Console.WriteLine("CUDA kernel launch with {0} blocks of {1} threads\n", blocksPerGrid, threadsPerBlock); vectorAdd.BlockDimensions = new dim3(threadsPerBlock,1, 1); vectorAdd.GridDimensions = new dim3(blocksPerGrid, 1, 1); vectorAdd.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, numElements); // Copy the device result vector in device memory to the host result vector // in host memory. Console.WriteLine("Copy output data from the CUDA device to the host memory\n"); d_C.CopyToHost(h_C); // Verify that the result vector is correct for (int i = 0; i < numElements; ++i) { if (Math.Abs(h_A[i] + h_B[i] - h_C[i]) > 1e-5) { Console.WriteLine("Result verification failed at element {0}!\n", i); return; } } Console.WriteLine("Test PASSED\n"); // Free device global memory d_A.Dispose(); d_B.Dispose(); d_C.Dispose(); ctx.Dispose(); Console.WriteLine("Done\n"); }
static void Main(string[] args) { // NOTE: You need to change this location to match your own machine. Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment."); Console.ResetColor(); System.Threading.Thread.Sleep(500); string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx"; CudaContext ctx = new CudaContext(); CUmodule module = ctx.LoadModule(path); kernel = new CudaKernel("kernel", module, ctx); // This tells the kernel to allocate a lot of threads for the Gpu. kernel.BlockDimensions = THREADS_PER_BLOCK; kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1; ; // Now let's load the kernel! // Create the topology. int[] topology = new int[] { 1, 200, 200, 100, 1 }; int height = topology.Length; int width = 0; for (int i = 0; i < topology.Length; i++) if (width < topology[i]) width = topology[i]; // Launch! float[] res = new float[height * width]; for (int i = 0; i < 10; i++) { float[] matrix = new float[height * width]; float[] weights = new float[height * width]; Random rand = new Random(424242); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0; weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0; } } // Load the kernel with some variables. CudaDeviceVariable<int> cuda_topology = topology; CudaDeviceVariable<float> cuda_membank = matrix; CudaDeviceVariable<float> cuda_weights = weights; Stopwatch sw = new Stopwatch(); sw.Start(); kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width); cuda_membank.CopyToHost(res); sw.Stop(); Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]); Console.ResetColor(); } Console.ReadKey(); }
public GpuContext() { this.ctx = new CudaContext(); }
static void InitKernels() { CudaContext cntxt = new CudaContext(); CUmodule cumodule = cntxt.LoadModule(@"C:\Users\Niels\Documents\uni ting\P10\P10\programs\small programs\CUDA 1D MA in C Sharp\CUDA 1D MA in C Sharp\Debug\kernel.ptx"); addWithCuda = new CudaKernel("_Z6kerneliiPi", cumodule, cntxt); }
public NVContext() { Context = new CudaContext(true); }
public void Compile() { using (var ctx = new CudaContext()) { // with verbaim string @, we only have to double up double quotes: no other escaping string source = @" extern ""C"" __global__ void saxpy(float a, float *x, float *y, float *out, size_t n) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { out[tid] = a * x[tid] + y[tid]; } } "; source += Environment.NewLine; var name = "Test"; var headers = new string[0]; var includeNames = new string[0]; var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames); //var compiler2 = new CudaRuntimeCompiler(source, name, headers, includeNames); // --ptxas-options=-v -keep compiler.Compile(new string[] { "-G" }); //var ptxString = compiler.GetPTXAsString(); // for debugging var ptx = compiler.GetPTX(); //compiler2.Compile(new string[] { }); var kernel = ctx.LoadKernelPTX(ptx, "kernelName"); //One kernel per cu file: //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname"); kernel.GridDimensions = new dim3(1, 1, 1); kernel.BlockDimensions = new dim3(16, 16); //kernel.Run() var a = new CudaDeviceVariable<double>(100); //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr() //Multiple kernels per cu file: CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx"); CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx) { GridDimensions = new dim3(1, 1, 1), BlockDimensions = new dim3(16, 16), }; CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx) { GridDimensions = new dim3(1, 1, 1), BlockDimensions = new dim3(16, 16), }; } }
static void Main(string[] args) { ShrQATest.shrQAStart(args); Console.WriteLine("Vector Addition"); int N = 50000; //Init Cuda context ctx = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) resName = "vectorAdd_x64.ptx"; else resName = "vectorAdd.ptx"; string resNamespace = "vectorAdd"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) throw new ArgumentException("Kernel not found in resources."); CudaKernel vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd"); // Allocate input vectors h_A and h_B in host memory h_A = new float[N]; h_B = new float[N]; // Initialize input vectors RandomInit(h_A, N); RandomInit(h_B, N); // Allocate vectors in device memory and copy vectors from host memory to device memory // Notice the new syntax with implicit conversion operators: Allocation of device memory and data copy is one operation. d_A = h_A; d_B = h_B; d_C = new CudaDeviceVariable<float>(N); // Invoke kernel int threadsPerBlock = 256; vectorAddKernel.BlockDimensions = threadsPerBlock; vectorAddKernel.GridDimensions = (N + threadsPerBlock - 1) / threadsPerBlock; vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N); // Copy result from device memory to host memory // h_C contains the result in host memory h_C = d_C; // Verify result int i; for (i = 0; i < N; ++i) { float sum = h_A[i] + h_B[i]; if (Math.Abs(h_C[i] - sum) > 1e-5) break; } CleanupResources(); ShrQATest.shrQAFinishExit(args, i == N ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
private bool InitializeD3D() { HwndSource hwnd = new HwndSource(0, 0, 0, 0, 0, "null", IntPtr.Zero); // Create the D3D object. d3d = new Direct3DEx(); PresentParameters pp = new PresentParameters(); pp.BackBufferWidth = 512; pp.BackBufferHeight = 512; pp.BackBufferFormat = Format.Unknown; pp.BackBufferCount = 0; pp.Multisample = MultisampleType.None; pp.MultisampleQuality = 0; pp.SwapEffect = SwapEffect.Discard; pp.DeviceWindowHandle = (IntPtr)0; pp.Windowed = true; pp.EnableAutoDepthStencil = false; pp.AutoDepthStencilFormat = Format.Unknown; pp.PresentationInterval = PresentInterval.Default; bDeviceFound = false; CUdevice[] cudaDevices = null; for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++) { device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, hwnd.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp); try { cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9); bDeviceFound = cudaDevices.Length > 0; infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.\n"); break; } catch (CudaException) { //No Cuda device found for this Direct3D9 device infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.\n"); } } // we check to make sure we have found a cuda-compatible D3D device to work on if (!bDeviceFound) { infoLog.AppendText("No CUDA-compatible Direct3D9 device available"); if (device != null) device.Dispose(); return false; } ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9); deviceName.Text = "Device name: " + ctx.GetDeviceName(); // Set projection matrix SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1); device.SetTransform(TransformState.Projection, matProj); // Turn off D3D lighting, since we are providing our own vertex colors device.SetRenderState(RenderState.Lighting, false); //Load kernels CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_k", module, ctx); d3dimage.Lock(); Surface surf = device.GetBackBuffer(0, 0); d3dimage.SetBackBuffer(D3DResourceType.IDirect3DSurface9, surf.ComPointer); d3dimage.Unlock(); surf.Dispose(); //Setup the "real" frame rate counter. //The cuda counter only measures cuda runtime, not the overhead to actually //show the result via DirectX and WPF. realLastTick = Environment.TickCount; return true; }
// Initialization code to find the best CUDA Device static int findCudaDevice(string[] args) { int devID = 0; // If the command-line has a device number specified, use it bool found = false; foreach (var item in args) { if (item.Contains("device=")) { found = true; if (!int.TryParse(item, out devID)) { Console.WriteLine("Invalid command line parameters"); Environment.Exit(-1); } if (devID < 0) { Console.WriteLine("Invalid command line parameters\n"); Environment.Exit(-1); } else { devID = gpuDeviceInit(devID); if (devID < 0) { Console.WriteLine("exiting...\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_FAILED); Environment.Exit(-1); } } } } if (!found) { // Otherwise pick the device with highest Gflops/s devID = CudaContext.GetMaxGflopsDeviceId(); ctx = new CudaContext(devID, CUCtxFlags.SchedAuto); Console.Write("> Using CUDA device [{0}]: {1}\n", devID, ctx.GetDeviceName()); } return devID; }