private void initGLAndCuda() { //Create render target control m_renderControl = new OpenTK.GLControl(GraphicsMode.Default, 1, 0, GraphicsContextFlags.Default); m_renderControl.Dock = DockStyle.Fill; m_renderControl.BackColor = Color.White; m_renderControl.BorderStyle = BorderStyle.FixedSingle; m_renderControl.KeyDown += new KeyEventHandler(m_renderControl_KeyDown); m_renderControl.MouseMove += new MouseEventHandler(m_renderControl_MouseMove); m_renderControl.MouseDown += new MouseEventHandler(m_renderControl_MouseDown); m_renderControl.SizeChanged += new EventHandler(m_renderControl_SizeChanged); panel1.Controls.Add(m_renderControl); Console.WriteLine(" OpenGL device is Available"); int deviceID = CudaContext.GetMaxGflopsDeviceId(); ctx = CudaContext.CreateOpenGLContext(deviceID, CUCtxFlags.BlockingSync); string console = string.Format("CUDA device [{0}] has {1} Multi-Processors", ctx.GetDeviceName(), ctx.GetDeviceInfo().MultiProcessorCount); Console.WriteLine(console); CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_OGL", module, ctx); hvfield = new cData[DS]; dvfield = new CudaPitchedDeviceVariable<cData>(DIM, DIM); tPitch = dvfield.Pitch; dvfield.CopyToDevice(hvfield); vxfield = new CudaDeviceVariable<cData>(DS); vyfield = new CudaDeviceVariable<cData>(DS); // Create particle array particles = new cData[DS]; initParticles(particles, DIM, DIM); // TODO: update kernels to use the new unpadded memory layout for perf // rather than the old FFTW-compatible layout planr2c = new CudaFFTPlan2D(DIM, DIM, cufftType.R2C, Compatibility.FFTWPadding); planc2r = new CudaFFTPlan2D(DIM, DIM, cufftType.C2R, Compatibility.FFTWPadding); GL.GenBuffers(1, out vbo); GL.BindBuffer(BufferTarget.ArrayBuffer, vbo); GL.BufferData<cData>(BufferTarget.ArrayBuffer, new IntPtr(cData.SizeOf * DS), particles, BufferUsageHint.DynamicDraw); int bsize; GL.GetBufferParameter(BufferTarget.ArrayBuffer, BufferParameterName.BufferSize, out bsize); if (bsize != DS * cData.SizeOf) throw new Exception("Sizes don't match."); GL.BindBuffer(BufferTarget.ArrayBuffer, 0); cuda_vbo_resource = new CudaGraphicsInteropResourceCollection(); cuda_vbo_resource.Add(new CudaOpenGLBufferInteropResource(vbo, CUGraphicsRegisterFlags.None)); texref = new CudaTextureArray2D(advectVelocity_k, "texref", CUAddressMode.Wrap, CUFilterMode.Linear, 0, CUArrayFormat.Float, DIM, DIM, CudaArray2DNumChannels.Two); stopwatch = new CudaStopWatch(CUEventFlags.Default); reshape(); isInit = true; display(); }
public DimensionReductionFitness( CudaContext context, IDimensionAccuracy accuracyFunc, int popSize, int genLength ) { this.accuracyFunc = accuracyFunc; this.popSize = popSize; this.context = context; deviceVectorSizes = new CudaDeviceVariable <int>(popSize); fitnessKernel = context.LoadKernel( "kernels/dimensionsReductions.ptx", "fitnessFunction" ); fitnessKernel.GridDimensions = 1; fitnessKernel.BlockDimensions = popSize; Alpha = 0.7f; sizeAndIndecesKernel = context.LoadKernel("kernels/Common.ptx", "countVectorsIndeces"); sizeAndIndecesKernel.SetConstantVariable("genLength", genLength); sizeAndIndecesKernel.GridDimensions = 1; sizeAndIndecesKernel.BlockDimensions = popSize; populationIndeces = new CudaDeviceVariable <int>(genLength * popSize); }
internal CudaError LaunchKernelWithStreamBinding( CudaStream stream, CudaKernel kernel, RuntimeKernelConfig config, IntPtr args, IntPtr kernelArgs) { var binding = stream.BindScoped(); var result = LaunchKernel( kernel.FunctionPtr, config.GridDim.X, config.GridDim.Y, config.GridDim.Z, config.GroupDim.X, config.GroupDim.Y, config.GroupDim.Z, config.SharedMemoryConfig.DynamicArraySize, stream.StreamPtr, args, kernelArgs); binding.Recover(); return(result); }
//Test CUDA kernel for complex multiplication public void test(int N) { CudaContext ctx = new CudaContext(); CudaKernel kernel = ctx.LoadKernel("kernel.ptx", "ComplexMultCUDA"); kernel.GridDimensions = N; kernel.BlockDimensions = 1; double2[] a = new double2[N]; double2[] b = new double2[N]; double2[] c = new double2[N]; for (int i = 0; i < N; i++) { a[i].x = 1; a[i].y = 3; b[i].x = 2; b[i].y = 2; } CudaDeviceVariable <double2> d_a = null; CudaDeviceVariable <double2> d_b = null; try { d_a = a; d_b = b; } catch (Exception e) { Console.WriteLine("{0} Exception caught.", e); return; } kernel.Run(d_a.DevicePointer, d_b.DevicePointer, N); c = d_b; Console.WriteLine("C.last()={0}+i{1}", c.Last().x, c.Last().y); }
private nvrtcResult LoadKernel(string kernelSourceFile, out CudaKernel kernel) { nvrtcResult result; kernel = null; using (var compiler = new CudaRuntimeCompiler(File.ReadAllText(kernelSourceFile), Path.GetFileName(kernelSourceFile))) { try { compiler.Compile(new string[0]); result = nvrtcResult.Success; } catch (NVRTCException ex) { result = ex.NVRTCError; } var outputFileWithoutExt = Path.Combine(Path.GetDirectoryName(kernelSourceFile), Path.GetFileNameWithoutExtension(kernelSourceFile)); File.WriteAllText(outputFileWithoutExt + ".ptx.log", compiler.GetLogAsString()); if (result == nvrtcResult.Success) { var ptx = compiler.GetPTX(); kernel = _CudaContext.LoadKernelFatBin(ptx, "Run"); File.WriteAllBytes(outputFileWithoutExt + ".ptx", ptx); } } return(result); }
public Layer(Int3 size, Layer prev, ref CudaContext ctx, int type) { this.ctx = ctx; this.type = type; this.size = size; data = new float[size.Mul]; bias = new float[size.Mul]; error = new float[size.Mul]; generateWeights(size, prev.size, kernelType.fullyConnected); forward = ctx.LoadKernel("kernel.ptx", "Forward"); forward.GridDimensions = new dim3(size.x, size.y, size.z); forward.BlockDimensions = new dim3(prev.size.x, prev.size.y, prev.size.z); back = ctx.LoadKernel("kernel.ptx", "Backprop"); back.GridDimensions = new dim3(size.x, size.y, size.z); back.BlockDimensions = new dim3(prev.size.x, prev.size.y, prev.size.z); clear = ctx.LoadKernel("kernel.ptx", "Clear"); clear.GridDimensions = new dim3(size.x, size.y, size.z); activate = ctx.LoadKernel("kernel.ptx", "Activate"); activate.GridDimensions = new dim3(size.x, size.y, size.z); SoftmaxSigma = ctx.LoadKernel("kernel.ptx", "SoftmaxSigma"); SoftmaxSigma.GridDimensions = new dim3(size.x, size.y, size.z); SoftmaxFinal = ctx.LoadKernel("kernel.ptx", "SoftmaxFinal"); SoftmaxFinal.BlockDimensions = new dim3(size.x, size.y, size.z); SoftmaxVal = new float[] { 0 }; }
public Layer(FileLayer fl, ref CudaContext ctx) { this.ctx = ctx; type = fl.type; size = fl.size; data = new float[fl.size.Mul]; bias = new float[fl.size.Mul]; error = new float[fl.size.Mul]; forward = ctx.LoadKernel("kernel.ptx", "Forward"); forward.GridDimensions = new dim3(size.x, size.y, size.z); forward.BlockDimensions = new dim3(fl.prevSize.x, fl.prevSize.y, fl.prevSize.z); back = ctx.LoadKernel("kernel.ptx", "Backprop"); back.GridDimensions = new dim3(size.x, size.y, size.z); back.BlockDimensions = new dim3(fl.prevSize.x, fl.prevSize.y, fl.prevSize.z); clear = ctx.LoadKernel("kernel.ptx", "Clear"); clear.GridDimensions = new dim3(size.x, size.y, size.z); activate = ctx.LoadKernel("kernel.ptx", "Activate"); activate.GridDimensions = new dim3(size.x, size.y, size.z); }
/// <summary> /// Creates a new surface from array memory. Allocates new array. /// </summary> /// <param name="kernel"></param> /// <param name="surfName"></param> /// <param name="flags"></param> /// <param name="format"></param> /// <param name="width">In elements</param> /// <param name="height">In elements</param> /// <param name="depth">In elements</param> /// <param name="numChannels"></param> /// <param name="arrayFlags"></param> public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, SizeT depth, CudaArray3DNumChannels numChannels, CUDAArray3DFlags arrayFlags) { _surfref = new CUsurfref(); res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName)); if (res != CUResult.Success) throw new CudaException(res); _flags = flags; _format = format; _height = height; _width = width; _depth = depth; _numChannels = (int)numChannels; _name = surfName; _module = kernel.CUModule; _cufunction = kernel.CUFunction; _channelSize = CudaHelperMethods.GetChannelSize(format); _dataSize = height * width * depth * _numChannels * _channelSize; _array = new CudaArray3D(format, width, height, depth, numChannels, arrayFlags); res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res)); if (res != CUResult.Success) throw new CudaException(res); }
//private CudaKernel kernel1; //public Class1() //{ // //int deviceID = 0; // //CudaContext ctx = new CudaContext(deviceID); // //CUmodule cumodule = ctx.LoadModulePTX(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx"); // //kernel1 = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx); //} public static double[,] TestMatrix(double[][,] a) { using (CudaContext ctx = new CudaContext(0)) { CUmodule cumodule = ctx.LoadModule(@"C:\work\Sobel\TestCuda\x64\Debug\kernel.ptx"); var kernel = new CudaKernel("_Z9matrixSumPdS_iii", cumodule, ctx); int dimZ = a.Length; int dimX = a[0].GetLength(0); int dimY = a[0].GetLength(1); kernel.GridDimensions = new dim3(28, 28, 1); kernel.BlockDimensions = new dim3(1, 1, 1); //kernel.BlockDimensions = new dim3(dimX, dimY, 1); // Allocate vectors in device memory and copy vectors from host memory to device memory CudaDeviceVariable <double> dA = a.ToLinearArray(); //CudaDeviceVariable<double> dB = ToLinearArray(b); CudaDeviceVariable <double> dC = new CudaDeviceVariable <double>(dimX * dimY); // Invoke kernel kernel.Run(dA.DevicePointer, dC.DevicePointer, dimX, dimY, dimZ); // Copy result from device memory to host memory double[] c = dC; //ctx.FreeMemory(dC.DevicePointer); //ctx.FreeMemory(dA.DevicePointer); //ctx.Dispose(); return(ToMultyArray(c, dimX)); } }
public static void blaa() { int num = 10; //NewContext creation CudaContext cntxt = new CudaContext(); //Module loading from precompiled .ptx in a project output folder CUmodule cumodule = cntxt.LoadModule("kernel.ptx"); //_Z9addKernelPf - function name, can be found in *.ptx file CudaKernel addWithCuda = new CudaKernel("_Z9addKernelPf", cumodule, cntxt); //Create device array for data CudaDeviceVariable <float> vec1_device = new CudaDeviceVariable <float>(num); //Create arrays with data float[] vec1 = new float[num]; //Copy data to device vec1_device.CopyToDevice(vec1); //Set grid and block dimensions addWithCuda.GridDimensions = new dim3(8, 1, 1); addWithCuda.BlockDimensions = new dim3(512, 1, 1); //Run the kernel addWithCuda.Run( vec1_device.DevicePointer); //Copy data from device vec1_device.CopyToHost(vec1); }
/// <summary> /// Creates a new mipmapped texture from array memory. Allocates a new mipmapped array. /// </summary> /// <param name="kernel"></param> /// <param name="texName"></param> /// <param name="addressModeForAllDimensions"></param> /// <param name="filterMode"></param> /// <param name="flags"></param> /// <param name="descriptor"></param> /// <param name="numMipmapLevels"></param> /// <param name="maxAniso"></param> /// <param name="mipmapFilterMode"></param> /// <param name="mipmapLevelBias"></param> /// <param name="minMipmapLevelClamp"></param> /// <param name="maxMipmapLevelClamp"></param> public CudaTextureMipmappedArray(CudaKernel kernel, string texName, CUAddressMode addressModeForAllDimensions, CUFilterMode filterMode, CUTexRefSetFlags flags, CUDAArray3DDescriptor descriptor, uint numMipmapLevels, uint maxAniso, CUFilterMode mipmapFilterMode, float mipmapLevelBias, float minMipmapLevelClamp, float maxMipmapLevelClamp) : this(kernel, texName, addressModeForAllDimensions, addressModeForAllDimensions, addressModeForAllDimensions, filterMode, flags, descriptor, numMipmapLevels, maxAniso, mipmapFilterMode, mipmapLevelBias, minMipmapLevelClamp, maxMipmapLevelClamp) { }
/// <summary> /// Creates a new surface from array memory. /// </summary> /// <param name="kernel"></param> /// <param name="surfName"></param> /// <param name="flags"></param> /// <param name="array"></param> public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CudaArray3D array) { _surfref = new CUsurfref(); res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName)); if (res != CUResult.Success) { throw new CudaException(res); } _flags = flags; _format = array.Array3DDescriptor.Format; _height = array.Height; _width = array.Width; _depth = array.Depth; _numChannels = (int)array.Array3DDescriptor.NumChannels; _name = surfName; _module = kernel.CUModule; _cufunction = kernel.CUFunction; _channelSize = CudaHelperMethods.GetChannelSize(array.Array3DDescriptor.Format); _dataSize = array.Height * array.Width * array.Depth * array.Array3DDescriptor.NumChannels * _channelSize; _array = array; res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res)); if (res != CUResult.Success) { throw new CudaException(res); } _isOwner = false; }
public CUDAPrefixScan(CUmodule module, CudaContext context) { this.context = context; kernelScanExclusiveShared = new CudaKernel("scanExclusiveShared", module, context); kernelScanExclusiveShared2 = new CudaKernel("scanExclusiveShared2", module, context); kernelUniformUpdate = new CudaKernel("uniformUpdate", module, context); }
public override void DqnStanfordEvaluation(Matrix predictedActionIndices, Matrix chosenActionIndices, Matrix currentRewards, Matrix matchPredictRewards, Matrix nonMatchPredictRewards, bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false) { this.VerifyDimentionalityOfMatrices(predictedActionIndices, chosenActionIndices, currentRewards); this.VerifyDimentionalityOfMatrices(currentRewards, matchPredictRewards, nonMatchPredictRewards); if (copyInputsFromCpuToGpu) { predictedActionIndices.CopyToCuda(); chosenActionIndices.CopyToCuda(); currentRewards.CopyToCuda(); } CudaKernel kernel = InitializeGridsAndThreads("_Z21DqnStanfordEvaluationPfS_S_S_S_i", matchPredictRewards); kernel.Run(predictedActionIndices.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer, matchPredictRewards.DeviceData.DevicePointer, nonMatchPredictRewards.DeviceData.DevicePointer, matchPredictRewards.Row); if (copyOutputsFromGpuToCpu) { matchPredictRewards.CopyToCuda(); nonMatchPredictRewards.CopyToCuda(); } }
/// <summary> /// Creates a new surface from array memory. Allocates new array. /// </summary> /// <param name="kernel"></param> /// <param name="surfName"></param> /// <param name="flags"></param> /// <param name="format"></param> /// <param name="width">In elements</param> /// <param name="height">In elements</param> /// <param name="depth">In elements</param> /// <param name="numChannels"></param> /// <param name="arrayFlags"></param> public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, SizeT depth, CudaArray3DNumChannels numChannels, CUDAArray3DFlags arrayFlags) { _surfref = new CUsurfref(); res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName)); if (res != CUResult.Success) { throw new CudaException(res); } _flags = flags; _format = format; _height = height; _width = width; _depth = depth; _numChannels = (int)numChannels; _name = surfName; _module = kernel.CUModule; _cufunction = kernel.CUFunction; _channelSize = CudaHelperMethods.GetChannelSize(format); _dataSize = height * width * depth * _numChannels * _channelSize; _array = new CudaArray3D(format, width, height, depth, numChannels, arrayFlags); res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res)); if (res != CUResult.Success) { throw new CudaException(res); } _isOwner = true; }
private nvrtcResult LoadKernel(string path, out CudaKernel kernel, out string log) { nvrtcResult result; using (var rtc = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path))) { try { rtc.Compile(new string[0]); // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options result = nvrtcResult.Success; } catch (NVRTCException ex) { result = ex.NVRTCError; } log = rtc.GetLogAsString(); if (result == nvrtcResult.Success) { var ptx = rtc.GetPTX(); kernel = this._context.CudaContext.LoadKernelFatBin(ptx, "Run"); // hard-coded method name from the CUDA kernel } else { kernel = null; } } return(result); }
public override void MatrixBellmanErrorAndDerivative(Matrix predictedQValues, Matrix maxQHatValues, Matrix chosenActionIndices, Matrix currentRewards, Matrix error, Matrix errorDerivative, float discount, Matrix isLastEpisode, bool copyInputsFromCpuToGpu = false, bool copyOutputsFromGpuToCpu = false) { this.VerifyDimentionalityOfMatrices(predictedQValues, errorDerivative); this.VerifyColumnWithRowOfMatrices(predictedQValues, maxQHatValues); this.VerifyDimentionalityOfMatrices(maxQHatValues, chosenActionIndices, currentRewards); this.VerifyDimentionalityOfMatrices(currentRewards, error, isLastEpisode); if (copyInputsFromCpuToGpu) { predictedQValues.CopyToCuda(); maxQHatValues.CopyToCuda(); } CudaKernel kernel = InitializeGridsAndThreads("_Z26matrixBellmanErrorAndDerivPfS_S_S_S_S_fS_ii", errorDerivative); kernel.Run(predictedQValues.DeviceData.DevicePointer, maxQHatValues.DeviceData.DevicePointer, chosenActionIndices.DeviceData.DevicePointer, currentRewards.DeviceData.DevicePointer, error.DeviceData.DevicePointer, errorDerivative.DeviceData.DevicePointer, discount, isLastEpisode.DeviceData.DevicePointer, errorDerivative.Row, errorDerivative.Column); if (copyOutputsFromGpuToCpu) { error.CopyFromCuda(); errorDerivative.CopyFromCuda(); } }
private void RunKernel(Volume <T> input, Volume <T> output, CudaKernel kernel, params object[] extraParameters) { if (!(input.Storage is IVolumeStorage <T> inputStorage)) { throw new ArgumentException($"{nameof(input)} storage should be VolumeStorage", nameof(input)); } if (!(output.Storage is IVolumeStorage <T> outputStorage)) { throw new ArgumentException($"{nameof(output)} storage should be VolumeStorage", nameof(output)); } inputStorage.CopyToDevice(); outputStorage.CopyToDevice(); var count = (int)output.Shape.TotalLength; var parameters = new object[] { inputStorage.DeviceBuffer.DevicePointer, outputStorage.DeviceBuffer.DevicePointer }; if (extraParameters != null) { parameters = parameters.Concat(extraParameters).ToArray(); } this.RunKernel(kernel, count, parameters); }
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args) { ThrowIfAnyTensorInvalid(args); cudaContext.SetCurrent(); CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext); IEnumerable <Tensor> allTensors = args.OfType <Tensor>(); Tensor firstTensor = allTensors.First(); long elementCount = firstTensor.ElementCount(); ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray()); ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args); ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock(); ManagedCuda.VectorTypes.dim3 grid = ApplyUtils.GetApplyGrid(deviceInfo, elementCount); string fullKernelName = PermutationGenerator.GetMangledName(baseName, spec); CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, fullKernelName); kernel.GridDimensions = grid; kernel.BlockDimensions = block; kernel.RunAsync(CUstream.NullStream, args); }
static void InitKernels() { cntxt = new CudaContext(); CUmodule cumodule = cntxt.LoadModulePTX(@"C:\work\Sobel\CudaTest\x64\Debug\kernel.ptx"); matrixSumCude = new CudaKernel("_Z15matrixSumKernelPdPKdiii", cumodule, cntxt); }
public VectorReductionAccuracy(CudaContext context, DeviceDataSet <int> teaching, DeviceDataSet <int> test, int popSize) { this.teaching = teaching; this.test = test; this.popSize = popSize; this.context = context; calculatedNeabours = new CudaDeviceVariable <int>(teaching.length * test.length); deviceAccuracy = new CudaDeviceVariable <float>(popSize); Profiler.Start("calculate neabours"); Neabours.CalculateNeabours(context, teaching, test, calculatedNeabours, ThreadsPerBlock); Profiler.Stop("calculate neabours"); accuracyKernel = context.LoadKernel("kernels/VectorReduction.ptx", "calculateAccuracy"); dim3 gridDimension = new dim3() { x = (uint)(test.length / ThreadsPerBlock + 1), y = (uint)popSize, z = 1 }; accuracyKernel.GridDimensions = gridDimension; accuracyKernel.BlockDimensions = ThreadsPerBlock; accuracyKernel.SetConstantVariable("testVectorsCount", test.length); accuracyKernel.SetConstantVariable("teachingVectorsCount", teaching.length); accuracyKernel.SetConstantVariable("attributeCount", teaching.attributeCount); accuracyKernel.SetConstantVariable("genLength", teaching.length); K = 3; CountToPass = 2; }
public void CreateKernelObjects() { kernels.Clear(); int funcNo = 0; foreach (var sc in sourceCodes) { for (int ki = 0; ki < (grm.kernelPerIndividual? sc.numberOfIndividuals:1); ki++) { CudaKernel kernel = new CudaKernel("createdFunc" + (grm.kernelPerIndividual ? funcNo++ : 0).ToString(), sc.mod, ctx); if (NUMTESTCASE > 256) { // --- multi dim block if testcases > 256 kernel.GridDimensions = NUMTESTCASE / 256; kernel.BlockDimensions = 256; } else { kernel.GridDimensions = 1; kernel.BlockDimensions = NUMTESTCASE; } if (ki == 0) // <-- this is due to managedcuda not implementing setconstantvar as a module method! { grm.SetKernelParameters(kernel); } kernels.Add(kernel); } } }
public CudaIntersectionDevice(RayEngineScene scene, NVContext ctx) : base(scene) { wallclock = new Stopwatch(); this.todoRayBuffers = new ConcurrentQueue<Tuple<int, RayBuffer>>(); this.doneRayBuffers = new List<ConcurrentQueue<RayBuffer>>() { { new ConcurrentQueue<RayBuffer>() } }; this.started = false; if (ctx != null) { this.cudaContext = ctx; } else { this.cudaContext = new NVContext() { Context = new CudaContext(CudaContext.GetMaxGflopsDeviceId()) }; } using (var sr = new StreamReader(@"G:\Git\RayDen\CudaMegaRay\x64\Release\kernel.cu.ptx")) { intersectKernel = cudaContext.Context.LoadKernelPTX(sr.BaseStream, "IntersectLBvh"); } this.rays = new CudaDeviceVariable<RayData>(RayBuffer.RayBufferSize); this.hits = new CudaDeviceVariable<RayHit>(RayBuffer.RayBufferSize); verts = scene.Vertices.ToArray(); //scene.Triangles.Select(i => i.GetInfo()).ToArray(); var ti = scene.Triangles.Select(i => i.GetInfo()).ToArray(); var da = new BvhDataAdapter(scene); var treeData = da.GetMpData(); bvh = treeData; trianglesCount = ti.Length; tris = ti; nodesCount = treeData.Length; Tracer.TraceLine("BVH Data Size {0:F3} MBytes", (treeData.Length * 32f) / (1024f * 1024f)); }
public GPU_Functionality(int deviceID = 0) { ctx = new CudaContext(deviceID); version = ctx.GetDeviceComputeCapability(); Trace.WriteLine($"cuda compute capability {version.Major}.{version.Minor}"); CUmodule collision_module = ctx.LoadModulePTX("collision_kernels.ptx"); kNarrowPhase = new CudaKernel("kNarrowPhase_new", collision_module, ctx); kFindClosestFace = new CudaKernel("kFindClosestFace", collision_module, ctx); kCollisionResponseForce = new CudaKernel("kCollisionResponseForce", collision_module, ctx); dim3 block = new dim3(block_size, 1, 1); kNarrowPhase.BlockDimensions = block; kFindClosestFace.BlockDimensions = block; kCollisionResponseForce.BlockDimensions = block; // cz CUmodule module_cz_kernels = ctx.LoadModulePTX("cz_kernels.ptx"); kczCZForce = new CudaKernel("kczCZForce", module_cz_kernels, ctx); kczCZForce.BlockDimensions = block; // elem CUmodule module_elem_kernels = ctx.LoadModulePTX("elem_kernels.ptx"); kelElementElasticityForce = new CudaKernel("kelElementElasticityForce", module_elem_kernels, ctx); kelElementElasticityForce.BlockDimensions = block; }
internal ManagedCuda.NVRTC.nvrtcResult LoadKernel(out string log) { string path = "MyKernels.c"; ManagedCuda.NVRTC.nvrtcResult result; using (var rtc = new ManagedCuda.NVRTC.CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path))) { try { rtc.Compile(new string[0]); // see http://docs.nvidia.com/cuda/nvrtc/index.html for usage and options result = ManagedCuda.NVRTC.nvrtcResult.Success; } catch (ManagedCuda.NVRTC.NVRTCException ex) { result = ex.NVRTCError; } log = rtc.GetLogAsString(); if (result == ManagedCuda.NVRTC.nvrtcResult.Success) { byte[] ptx = rtc.GetPTX(); multiply = ctx.LoadKernelFatBin(ptx, "Multiply"); // hard-coded method name from the CUDA kernel } } return(result); }
private void RunKernel(CudaKernel kernel, int count, IEnumerable <object> parameters) { // configure the dimensions; note, usually this is a lot more dynamic based // on input data, but we'll still go through the motions int threadsPerBlock, blockCount; if (count <= this._context.DefaultThreadsPerBlock) // a single block { blockCount = 1; threadsPerBlock = RoundUp(count, this._context.WarpSize); // slight caveat here; if you are using "shuffle" operations, you // need to use entire "warp"s - otherwise the result is undefined } else if (count <= this._context.DefaultThreadsPerBlock * this._context.DefaultBlockCount) { // more than enough work to keep us busy; just use that threadsPerBlock = this._context.DefaultThreadsPerBlock; blockCount = this._context.DefaultBlockCount; } else { // do the math to figure out how many blocks we need threadsPerBlock = this._context.DefaultThreadsPerBlock; blockCount = (count + threadsPerBlock - 1) / threadsPerBlock; } // we're using 1-D math, but actually CUDA supports blocks and grids that span 3 dimensions kernel.BlockDimensions = new dim3(threadsPerBlock, 1, 1); kernel.GridDimensions = new dim3(blockCount, 1, 1); // invoke the kernel var withCount = parameters.ToList(); withCount.Insert(0, count); kernel.RunAsync(this._context.DefaultStream.Stream, withCount.ToArray()); }
public CudaError LaunchKernelWithStreamBinding( CudaStream stream, CudaKernel kernel, int gridDimX, int gridDimY, int gridDimZ, int blockDimX, int blockDimY, int blockDimZ, int sharedMemSizeInBytes, IntPtr args, IntPtr kernelArgs) { var binding = stream.BindScoped(); var result = LaunchKernel( kernel.FunctionPtr, gridDimX, gridDimY, gridDimZ, blockDimX, blockDimY, blockDimZ, sharedMemSizeInBytes, stream.StreamPtr, args, kernelArgs); binding.Recover(); return(result); }
public static void init(int maxCnt) { _gpuVelocity = KernelLoader.load_kernel("update_velocities"); _gpuUpdate = KernelLoader.load_kernel("update_particles"); _dt = ISF.properties.dt; torus_d = new float[3] { ISF.properties.dx, ISF.properties.dy, ISF.properties.dz }; torus_res = new int[3] { ISF.properties.resx, ISF.properties.resy, ISF.properties.resz }; torus_size = new int[3] { ISF.properties.sizex, ISF.properties.sizey, ISF.properties.sizez }; d_k1x = new CudaDeviceVariable <float>(maxCnt); d_k1y = new CudaDeviceVariable <float>(maxCnt); d_k1z = new CudaDeviceVariable <float>(maxCnt); d_k2x = new CudaDeviceVariable <float>(maxCnt); d_k2y = new CudaDeviceVariable <float>(maxCnt); d_k2z = new CudaDeviceVariable <float>(maxCnt); d_k3x = new CudaDeviceVariable <float>(maxCnt); d_k3y = new CudaDeviceVariable <float>(maxCnt); d_k3z = new CudaDeviceVariable <float>(maxCnt); d_k4x = new CudaDeviceVariable <float>(maxCnt); d_k4y = new CudaDeviceVariable <float>(maxCnt); d_k4z = new CudaDeviceVariable <float>(maxCnt); }
internal nvrtcResult LoadKernel(out string log) { nvrtcResult result; using (var rtc = new CudaRuntimeCompiler(File.ReadAllText(path), Path.GetFileName(path))) { try { rtc.Compile(Array.Empty <string>()); result = nvrtcResult.Success; } catch (NVRTCException ex) { result = ex.NVRTCError; } log = rtc.GetLogAsString(); if (result == nvrtcResult.Success) { byte[] ptx = rtc.GetPTX(); multiply = ctx.LoadKernelFatBin(ptx, methodName); } } return(result); }
// Testing managed CUDA call private static void RunCudaWithAKernel() { // C# Cuda code to call kernel int N = 50000; int deviceID = 0; CudaContext ctx = new CudaContext(deviceID); CudaKernel kernel = ctx.LoadKernel("kernel_x64.ptx", "VecAdd"); int numOfThreads = 256; kernel.GridDimensions = (N + numOfThreads - 1) / numOfThreads; kernel.BlockDimensions = numOfThreads; // allocate memory in host (not gpu) var h_A = InitWithData(N, numOfThreads * 4); var h_B = InitWithData(N, numOfThreads); // Allocate vectors in device memory and copy from host to device. CudaDeviceVariable <float> d_A = h_A; CudaDeviceVariable <float> d_B = h_B; CudaDeviceVariable <float> d_C = new CudaDeviceVariable <float>(N); //Invoke kernel kernel.Run(d_A.DevicePointer, d_B.DevicePointer, d_C.DevicePointer, N); Console.WriteLine("kernel has runeth"); //Copy from memory of device to host. float[] h_C = d_C; }
public CudaKernel GetOrCreateCudaKernel(string moduleName, string kernelName) { CudaKernel kernel; moduleName = "MasterInclude"; if (!_kernels.TryGetValue(kernelName, out kernel)) { CUmodule module; if (!_modules.TryGetValue(moduleName, out module)) { string fatbinName = ""; if (IntPtr.Size == 8) { fatbinName = moduleName + ".x64.fatbin"; } else { fatbinName = moduleName + ".fatbin"; } using (Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(fatbinName)) { if (stream == null) { throw new Exception($"Fatbin embedded resource '{fatbinName}' could not be found"); } module = Context.LoadModuleFatBin(stream); _modules[moduleName] = module; } } kernel = new CudaKernel(kernelName, module, Context); _kernels[kernelName] = kernel; } return(kernel); }
private void RunKernel(Volume <T> input, Volume <T> output, CudaKernel kernel) { if (!Equals(input.Shape, output.Shape)) { throw new ArgumentException($"{nameof(input)} and {nameof(output)} should have the same shape."); } var inputStorage = input.Storage as IVolumeStorage <T>; if (inputStorage == null) { throw new ArgumentException($"{nameof(input)} storage should be VolumeStorage", nameof(input)); } var outputStorage = output.Storage as IVolumeStorage <T>; if (outputStorage == null) { throw new ArgumentException($"{nameof(output)} storage should be VolumeStorage", nameof(output)); } inputStorage.CopyToDevice(); outputStorage.CopyToDevice(); var count = (int)input.Shape.TotalLength; var parameters = new object[] { inputStorage.DeviceBuffer.DevicePointer, outputStorage.DeviceBuffer.DevicePointer }; RunKernel(kernel, count, parameters); }
public void CUDA_AddFloatArrays() { //Load Kernel image from resources Stream stream = new StreamReader(resName).BaseStream; if (stream == null) { throw new ArgumentException("Kernel not found in resources."); } vectorAddKernel = ctx.LoadKernelPTX(stream, "VecAdd"); var threadsPerBlock = 1024; vectorAddKernel.BlockDimensions = threadsPerBlock; vectorAddKernel.GridDimensions = (Count + threadsPerBlock - 1) / threadsPerBlock; CudaStopWatch w = new CudaStopWatch(); w.Start(); vectorAddKernel.Run(d_A.DevicePointer, d_B.DevicePointer, C.DevicePointer, Count); w.Stop(); Debug.Log(w.GetElapsedTime() / 1000.0f); Debug.Log($"{h_A[0]} + {h_B[0]} = {C[0]}"); Debug.Log($"{h_A[Count-1]} + {h_B[Count-1]} = {C[Count-1]}"); // Copy result from device memory to host memory // h_C contains the result in host memory // h_C = d_C; }
public CudaKernel GetKernel(string name, bool isStrongName = false) { if (!isStrongName) name = GetStrongName(name); var kernel = new CudaKernel(name, Module, Context); return kernel; }
static void InitKernels() { //max thread number - 65534x256=16776704 _matrixSize = 256; _threadsPerBlock = 256; CleanUpResources(); _cnContext = new CudaContext(CudaContext.GetMaxGflopsDeviceId()); CUmodule cumodule = _cnContext.LoadModule(@"\Kernel\kernel.ptx"); _multiplyTwoVectorWithCuda = new CudaKernel("_Z6kernel_", cumodule, _cnContext); }
/// <summary> /// Creates a new 2D texture from array memory. Allocates a new 2D array. /// </summary> /// <param name="kernel"></param> /// <param name="texName"></param> /// <param name="addressMode0"></param> /// <param name="addressMode1"></param> /// <param name="filterMode"></param> /// <param name="flags"></param> /// <param name="format"></param> /// <param name="height">In elements</param> /// <param name="width">In elements</param> /// <param name="numChannels">1,2 or 4</param> public CudaTextureArray2D(CudaKernel kernel, string texName, CUAddressMode addressMode0, CUAddressMode addressMode1, CUFilterMode filterMode, CUTexRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, CudaArray2DNumChannels numChannels) { _texref = new CUtexref(); res = DriverAPINativeMethods.ModuleManagement.cuModuleGetTexRef(ref _texref, kernel.CUModule, texName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Texture name: {3}", DateTime.Now, "cuModuleGetTexRef", res, texName)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 0, addressMode0); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 1, addressMode1); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFilterMode(_texref, filterMode); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFilterMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFlags(_texref, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFlags", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFormat(_texref, format, (int)numChannels); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFormat", res)); if (res != CUResult.Success) throw new CudaException(res); _filtermode = filterMode; _flags = flags; _addressMode0 = addressMode0; _addressMode1 = addressMode1; _format = format; _height = height; _width = width; _numChannels = (int)numChannels; _name = texName; _module = kernel.CUModule; _cufunction = kernel.CUFunction; _channelSize = CudaHelperMethods.GetChannelSize(format); _dataSize = height * width * _numChannels * _channelSize; _array = new CudaArray2D(format, width, height, numChannels); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetArray(_texref, _array.CUArray, CUTexRefSetArrayFlags.OverrideFormat); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetArray", res)); if (res != CUResult.Success) throw new CudaException(res); //res = DriverAPINativeMethods.ParameterManagement.cuParamSetTexRef(kernel.CUFunction, CUParameterTexRef.Default, _texref); //Debug.WriteLine("{0:G}, {1}: {2}", DateTime.Now, "cuParamSetTexRef", res); //if (res != CUResult.Success) throw new CudaException(res); }
public DadeCudaIntersectionDevice(RayEngineScene scene, NVContext ctx) : base(scene) { this.scene = scene; wallclock = new Stopwatch(); this.todoRayBuffers = new InputRayBufferCollection(); this.doneRayBuffers = new OutputRayBufferCollection(); this.started = false; if (ctx != null) { this.cudaContext = ctx; } else { this.cudaContext = new NVContext() { Context = new CudaContext(CudaContext.GetMaxGflopsDeviceId()) }; } using (var sr = new StreamReader(@"G:\Git\RayDen\CudaMegaRay\x64\Release\Intersection.cu.ptx")) { intersectKernel = cudaContext.Context.LoadKernelPTX(sr.BaseStream, "Intersect"); } this.rays = new CudaDeviceVariable<RayData>(RayBuffer.RayBufferSize); this.hits = new CudaDeviceVariable<RayHit>(RayBuffer.RayBufferSize); verts = scene.Vertices.ToArray(); tris=scene.Triangles.Select(i => i.GetInfo()).ToArray(); if (GlobalConfiguration.Instance.UseSceneCaching && scene.Cache != null) { bvh = scene.Cache.BvhData; nodesCount = scene.Cache.BvhData.Length; } else { var da = new BvhDataAdapter(scene); var treeData = da.BuildData(); bvh = treeData; nodesCount = treeData.Length; } Tracer.TraceLine("BVH Data Size {0:F3} MBytes", (nodesCount * 32f) / (1024f * 1024f)); }
/// <summary> /// Creates a new 2D texture from array memory. Allocates a new 2D array. /// </summary> /// <param name="kernel"></param> /// <param name="texName"></param> /// <param name="addressMode"></param> /// <param name="filterMode"></param> /// <param name="flags"></param> /// <param name="format"></param> /// <param name="height">In elements</param> /// <param name="width">In elements</param> /// <param name="numChannels">1,2 or 4</param> public CudaTextureArray2D(CudaKernel kernel, string texName, CUAddressMode addressMode, CUFilterMode filterMode, CUTexRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, CudaArray2DNumChannels numChannels) : this(kernel, texName, addressMode, addressMode, filterMode, flags, format, width, height, numChannels) { }
private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth) { int count = width * height * depth; int widthD = width - 1; int heightD = height - 1; int depthD = depth - 1; int countDecremented = widthD * heightD * depthD; dim3 blockDimensions = new dim3(8, 8, 8); dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0)); dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0)); CUDANoiseCube noiseCube = new CUDANoiseCube(); CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16); CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray); CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count); kernelPositionWeight.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions); kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth); kernelNormalAmbient.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions); kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount); int nearestW = NearestPowerOfTwo(widthD); int nearestH = NearestPowerOfTwo(heightD); int nearestD = NearestPowerOfTwo(depthD); int nearestCount = nearestW * nearestH * nearestD; CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount); trisCountDevice.Memset(0); CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented); kernelMarchingCubesCases.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented); kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD); CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount); int lastTrisCount = 0; trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int)); int lastPrefixSum = 0; prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int)); int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3; if (totalVerticesCount > 0) { if (container.Geometry != null) container.Geometry.Dispose(); container.VertexCount = totalVerticesCount; container.Geometry = new Buffer(graphicsDevice, new BufferDescription() { BindFlags = BindFlags.VertexBuffer, CpuAccessFlags = CpuAccessFlags.None, OptionFlags = ResourceOptionFlags.None, SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount, Usage = ResourceUsage.Default }); CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None); kernelMarchingCubesVertices.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented); directResource.Map(); kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD); directResource.UnMap(); directResource.Dispose(); } else { container.VertexCount = 0; if (container.Geometry != null) container.Geometry.Dispose(); } noiseCube.Dispose(); prefixSumsDev.Dispose(); trisCountDevice.Dispose(); offsetsDev.Dispose(); noiseArray.Dispose(); noiseTexture.Dispose(); voxelsDev.Dispose(); }
/// <summary> /// Creates a new mipmapped texture from array memory. Allocates a new mipmapped array. /// </summary> /// <param name="kernel"></param> /// <param name="texName"></param> /// <param name="addressMode0"></param> /// <param name="addressMode1"></param> /// <param name="addressMode2"></param> /// <param name="filterMode"></param> /// <param name="flags"></param> /// <param name="descriptor"></param> /// <param name="numMipmapLevels"></param> /// <param name="maxAniso"></param> /// <param name="mipmapFilterMode"></param> /// <param name="mipmapLevelBias"></param> /// <param name="minMipmapLevelClamp"></param> /// <param name="maxMipmapLevelClamp"></param> public CudaTextureMipmappedArray(CudaKernel kernel, string texName, CUAddressMode addressMode0, CUAddressMode addressMode1, CUAddressMode addressMode2, CUFilterMode filterMode, CUTexRefSetFlags flags, CUDAArray3DDescriptor descriptor, uint numMipmapLevels, uint maxAniso, CUFilterMode mipmapFilterMode, float mipmapLevelBias, float minMipmapLevelClamp, float maxMipmapLevelClamp) { _maxAniso = maxAniso; _mipmapFilterMode = mipmapFilterMode; _mipmapLevelBias = mipmapLevelBias; _minMipmapLevelClamp = minMipmapLevelClamp; _maxMipmapLevelClamp = maxMipmapLevelClamp; _texref = new CUtexref(); res = DriverAPINativeMethods.ModuleManagement.cuModuleGetTexRef(ref _texref, kernel.CUModule, texName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Texture name: {3}", DateTime.Now, "cuModuleGetTexRef", res, texName)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 0, addressMode0); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 1, addressMode1); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetAddressMode(_texref, 2, addressMode2); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetAddressMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFilterMode(_texref, filterMode); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFilterMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFlags(_texref, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFlags", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetFormat(_texref, descriptor.Format, (int)descriptor.NumChannels); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetFormat", res)); if (res != CUResult.Success) throw new CudaException(res); _filtermode = filterMode; _flags = flags; _addressMode0 = addressMode0; _addressMode1 = addressMode1; _addressMode2 = addressMode2; _arrayDescriptor = descriptor; _name = texName; _module = kernel.CUModule; _cufunction = kernel.CUFunction; _array = new CudaMipmappedArray(descriptor, numMipmapLevels); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmappedArray(_texref, _array.CUMipmappedArray, CUTexRefSetArrayFlags.OverrideFormat); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmappedArray", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMaxAnisotropy(_texref, maxAniso); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMaxAnisotropy", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapFilterMode(_texref, mipmapFilterMode); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapFilterMode", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapLevelBias(_texref, mipmapLevelBias); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapLevelBias", res)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.TextureReferenceManagement.cuTexRefSetMipmapLevelClamp(_texref, minMipmapLevelClamp, maxMipmapLevelClamp); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuTexRefSetMipmapLevelClamp", res)); if (res != CUResult.Success) throw new CudaException(res); }
private void button1_Click(object sender, EventArgs e) { triangulation = cuda.LoadPTX("Triangulation", "PTX", "Triangulation"); merge_vertical = cuda.LoadPTX("MergeVertical", "PTX", "merge"); regionSplitH = cuda.LoadPTX("RegionSplit", "PTX", "splitRegionH"); regionSplitV_Phase1 = cuda.LoadPTX("RegionSplit", "PTX", "splitRegionV_phase1"); regionSplitV_Phase2 = cuda.LoadPTX("RegionSplit", "PTX", "splitRegionV_phase2"); // add a random points TODO: add external source (ex. file) CreateRandomPoints(1024 * 8, new FxVector2f(0, 0), new FxVector2f(5000, 5000)); #region Set the max face/he/ve/boundary NumVertex = listAllVertex.Count; // select the spliting numbers // find the split points NumRegions = (int)Math.Ceiling((float)NumVertex / (float)maxVertexPerRegion); HorizontalRegions = (int)Math.Floor(Math.Sqrt(NumRegions)); VerticalRegions = (int)Math.Floor((float)NumRegions / (float)HorizontalRegions); NumRegions = HorizontalRegions * VerticalRegions; // init the array sizes // max faces per thread maxFacesPerThread = maxVertexPerRegion * 5; maxFacesPerThread += maxFacesPerThread % 32; // max Half edge per thread maxHalfEdgePerThread = maxFacesPerThread * 5; maxHalfEdgePerThread += maxHalfEdgePerThread % 32; // max vertex per thread maxBoundaryNodesPerThread = maxVertexPerRegion * 5; maxBoundaryNodesPerThread += maxBoundaryNodesPerThread % 32; WriteLine("maxFacesPerThread:" + maxFacesPerThread.ToString()); WriteLine("maxHalfEdgePerThread:" + maxHalfEdgePerThread.ToString()); WriteLine("maxBoundaryNodesPerThread:" + maxBoundaryNodesPerThread.ToString()); #endregion // init the array on cpu side threadInfo = new csThreadInfo[NumRegions]; regionInfo = new RegionInfo[NumRegions]; threadParam = new cbThreadParam(); #region init the thread param // init the thread param threadParam.maxFacesPerThread = (uint)maxFacesPerThread; threadParam.maxHalfEdgePerThread = (uint)maxHalfEdgePerThread; threadParam.maxBoundaryNodesPerThread = (uint)maxBoundaryNodesPerThread; threadParam.RegionsNum = (uint)NumRegions; MV_threadParam.ThreadNumPerRow = (uint)(VerticalRegions-1); MV_threadParam.HorizontalThreadNum = (uint)(HorizontalRegions); MV_threadParam.ThreadNum = MV_threadParam.HorizontalThreadNum * MV_threadParam.ThreadNumPerRow; MV_threadParam.stackMaxSize = stackMaxSize; MV_threadParam.depth = 0; #endregion // copy the data to the hardware d_threadInfo = threadInfo; d_regionInfo = regionInfo; d_threadParam = threadParam; d_FaceList = new CudaDeviceVariable<csFace>(maxFacesPerThread * NumRegions); d_BoundaryList = new CudaDeviceVariable<csBoundaryNode>(maxBoundaryNodesPerThread * NumRegions); d_HalfEdgeList = new CudaDeviceVariable<csHalfEdge>(maxHalfEdgePerThread * NumRegions); d_Stack = new CudaDeviceVariable<csStack>(stackMaxSize * NumRegions); d_UintStack = new CudaDeviceVariable<uint>(2 * stackMaxSize * NumRegions); // Update the region info by sort the vertex // try to sort the list GPUSort = new BitonicSort<FxVector2f>(cuda); }
private CudaKernel InitializeGridsAndThreads(string kernalName, Matrix a) { int maxThreads = Math.Min(this.maxThreadPerBlockDim, a.Row); dim3 threads = a.Column == 1 ? new dim3(maxThreads, 1) : new dim3(maxThreads, maxThreads); dim3 blocks = new dim3((a.Row + maxThreads - 1) / maxThreads, (a.Column + maxThreads - 1) / maxThreads); CudaKernel kernel = new CudaKernel(kernalName, this.cuModule, this.cudaContext) { GridDimensions = blocks, BlockDimensions = threads }; return kernel; }
private void InitializeCUDA() { context = new CudaContext(CudaContext.GetMaxGflopsDevice(), graphicsDevice.ComPointer, CUCtxFlags.SchedAuto, CudaContext.DirectXVersion.D3D11); module = context.LoadModulePTX(@"Kernels\kernel.ptx"); kernelPositionWeightNoiseCube = new CudaKernel("position_weight_noise_cube", module, context); kernelNormalAmbient = new CudaKernel("normal_ambient", module, context); kernelMarchingCubesCases = new CudaKernel("marching_cubes_cases", module, context); kernelMarchingCubesVertices = new CudaKernel("marching_cubes_vertices", module, context); kernelPositionWeightNoiseCubeWarp = new CudaKernel("position_weight_noise_cube_warp", module, context); kernelPositionWeightFormula = new CudaKernel("position_weight_formula", module, context); prefixScan = new CUDAPrefixScan(module, context); }
protected void InitContext() { var size = ParticlesCount * DimensionsCount; var threadsNum = 32; var blocksNum = ParticlesCount / threadsNum; Ctx = new CudaContext(0); UpdateVelocity = Ctx.LoadKernel("update_velocity_kernel.ptx", "updateVelocityKernel"); UpdateVelocity.GridDimensions = blocksNum; UpdateVelocity.BlockDimensions = threadsNum; Transpose = Ctx.LoadKernel(KernelFile, "transposeKernel"); Transpose.GridDimensions = blocksNum; Transpose.BlockDimensions = threadsNum; HostPositions = Random.RandomVector(size, -5.0, 5.0); HostVelocities = Random.RandomVector(size, -2.0, 2.0); HostPersonalBests = (double[]) HostPositions.Clone(); HostPersonalBestValues = Enumerable.Repeat(double.MaxValue,ParticlesCount).ToArray(); HostNeighbors = new int[ParticlesCount * 2]; for (var i = 0; i < ParticlesCount*2; i += 2) { int left, right; if (i == 0) left = ParticlesCount - 1; else left = i - 1; if (i == ParticlesCount - 1) right = 0; else right = i + 1; HostNeighbors[i] = left; HostNeighbors[i + 1] = right; } DevicePositions = HostPositions; DeviceVelocities = HostVelocities; DevicePersonalBests = HostPersonalBests; DevicePersonalBestValues = HostPersonalBestValues; DeviceNeighbors = HostNeighbors; Init(); }
/// <summary> /// Creates a new surface from array memory. /// </summary> /// <param name="kernel"></param> /// <param name="surfName"></param> /// <param name="flags"></param> /// <param name="array"></param> public CudaSurface(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CudaArray3D array) { _surfref = new CUsurfref(); res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref _surfref, kernel.CUModule, surfName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName)); if (res != CUResult.Success) throw new CudaException(res); _flags = flags; _format = array.Array3DDescriptor.Format; _height = array.Height; _width = array.Width; _depth = array.Depth; _numChannels = (int)array.Array3DDescriptor.NumChannels; _name = surfName; _module = kernel.CUModule; _cufunction = kernel.CUFunction; _channelSize = CudaHelperMethods.GetChannelSize(array.Array3DDescriptor.Format); _dataSize = array.Height * array.Width * array.Depth * array.Array3DDescriptor.NumChannels * _channelSize; _array = array; res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(_surfref, _array.CUArray, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res)); if (res != CUResult.Success) throw new CudaException(res); }
/// <summary> /// Creates a new 2D texture from array memory /// </summary> /// <param name="kernel"></param> /// <param name="texName"></param> /// <param name="addressMode"></param> /// <param name="filterMode"></param> /// <param name="flags"></param> /// <param name="array"></param> public CudaTextureArray2D(CudaKernel kernel, string texName, CUAddressMode addressMode, CUFilterMode filterMode, CUTexRefSetFlags flags, CudaArray2D array) : this(kernel, texName, addressMode, addressMode, filterMode, flags, array) { }
static void InitKernels() { CudaContext cntxt = new CudaContext(); CUmodule cumodule = cntxt.LoadModule(@"C:\Users\Niels\Documents\uni ting\P10\P10\programs\small programs\CUDA 1D MA in C Sharp\CUDA 1D MA in C Sharp\Debug\kernel.ptx"); addWithCuda = new CudaKernel("_Z6kerneliiPi", cumodule, cntxt); }
static void Main(string[] args) { // NOTE: You need to change this location to match your own machine. Console.ForegroundColor = ConsoleColor.Red; Console.WriteLine("NOTE: You must change the kernel location before running this project so it matches your own environment."); Console.ResetColor(); System.Threading.Thread.Sleep(500); string path = @"X:\MachineLearning\CUDAGraph-2\CUDAGraph_Kernel\Debug\kernel.cu.ptx"; CudaContext ctx = new CudaContext(); CUmodule module = ctx.LoadModule(path); kernel = new CudaKernel("kernel", module, ctx); // This tells the kernel to allocate a lot of threads for the Gpu. kernel.BlockDimensions = THREADS_PER_BLOCK; kernel.GridDimensions = VECTOR_SIZE / THREADS_PER_BLOCK + 1; ; // Now let's load the kernel! // Create the topology. int[] topology = new int[] { 1, 200, 200, 100, 1 }; int height = topology.Length; int width = 0; for (int i = 0; i < topology.Length; i++) if (width < topology[i]) width = topology[i]; // Launch! float[] res = new float[height * width]; for (int i = 0; i < 10; i++) { float[] matrix = new float[height * width]; float[] weights = new float[height * width]; Random rand = new Random(424242); for (int y = 0; y < height; y++) { for (int x = 0; x < width; x++) { matrix[y * width + x] = (y == 0 && x < topology[y]) ? 1.0f : 0; weights[y * width + x] = (x < topology[y]) ? (float)(rand.NextDouble() - rand.NextDouble()) : 0; } } // Load the kernel with some variables. CudaDeviceVariable<int> cuda_topology = topology; CudaDeviceVariable<float> cuda_membank = matrix; CudaDeviceVariable<float> cuda_weights = weights; Stopwatch sw = new Stopwatch(); sw.Start(); kernel.Run(cuda_topology.DevicePointer, cuda_membank.DevicePointer, cuda_weights.DevicePointer, height, width); cuda_membank.CopyToHost(res); sw.Stop(); Console.ForegroundColor = ConsoleColor.Green; Console.WriteLine("{0} ticks to compute -> {1}", sw.ElapsedTicks, res[0]); Console.ResetColor(); } Console.ReadKey(); }
/// <summary> /// Bind a CudaArray3D to a surface reference. /// </summary> /// <param name="kernel"></param> /// <param name="surfName"></param> /// <param name="flags"></param> /// <param name="array"></param> public static void BindArray(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CudaArray3D array) { CUsurfref surfref = new CUsurfref(); CUResult res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref surfref, kernel.CUModule, surfName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName)); if (res != CUResult.Success) throw new CudaException(res); res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(surfref, array.CUArray, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res)); if (res != CUResult.Success) throw new CudaException(res); }
private void InitializeD3D() { // Create the D3D object. d3d = new Direct3DEx(); PresentParameters pp = new PresentParameters(); pp.BackBufferWidth = 512; pp.BackBufferHeight = 512; pp.BackBufferFormat = Format.Unknown; pp.BackBufferCount = 0; pp.Multisample = MultisampleType.None; pp.MultisampleQuality = 0; pp.SwapEffect = SwapEffect.Discard; pp.DeviceWindowHandle = panel1.Handle; pp.Windowed = true; pp.EnableAutoDepthStencil = false; pp.AutoDepthStencilFormat = Format.Unknown; pp.PresentationInterval = PresentInterval.Default; bDeviceFound = false; CUdevice[] cudaDevices = null; for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++) { device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, panel1.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp); try { cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9); bDeviceFound = cudaDevices.Length > 0; Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA."); break; } catch (CudaException) { //No Cuda device found for this Direct3D9 device Console.WriteLine("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA."); } } // we check to make sure we have found a cuda-compatible D3D device to work on if (!bDeviceFound) { Console.WriteLine("No CUDA-compatible Direct3D9 device available"); if (device != null) device.Dispose(); Close(); return; } ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9); // Set projection matrix SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1); device.SetTransform(TransformState.Projection, matProj); // Turn off D3D lighting, since we are providing our own vertex colors device.SetRenderState(RenderState.Lighting, false); //Load kernels CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_k", module, ctx); }
private bool InitializeD3D() { HwndSource hwnd = new HwndSource(0, 0, 0, 0, 0, "null", IntPtr.Zero); // Create the D3D object. d3d = new Direct3DEx(); PresentParameters pp = new PresentParameters(); pp.BackBufferWidth = 512; pp.BackBufferHeight = 512; pp.BackBufferFormat = Format.Unknown; pp.BackBufferCount = 0; pp.Multisample = MultisampleType.None; pp.MultisampleQuality = 0; pp.SwapEffect = SwapEffect.Discard; pp.DeviceWindowHandle = (IntPtr)0; pp.Windowed = true; pp.EnableAutoDepthStencil = false; pp.AutoDepthStencilFormat = Format.Unknown; pp.PresentationInterval = PresentInterval.Default; bDeviceFound = false; CUdevice[] cudaDevices = null; for (g_iAdapter = 0; g_iAdapter < d3d.AdapterCount; g_iAdapter++) { device = new DeviceEx(d3d, d3d.Adapters[g_iAdapter].Adapter, DeviceType.Hardware, hwnd.Handle, CreateFlags.HardwareVertexProcessing | CreateFlags.Multithreaded, pp); try { cudaDevices = CudaContext.GetDirectXDevices(device.ComPointer, CUd3dXDeviceList.All, CudaContext.DirectXVersion.D3D9); bDeviceFound = cudaDevices.Length > 0; infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 and CUDA.\n"); break; } catch (CudaException) { //No Cuda device found for this Direct3D9 device infoLog.AppendText("> Display Device #" + d3d.Adapters[g_iAdapter].Adapter + ": \"" + d3d.Adapters[g_iAdapter].Details.Description + "\" supports Direct3D9 but not CUDA.\n"); } } // we check to make sure we have found a cuda-compatible D3D device to work on if (!bDeviceFound) { infoLog.AppendText("No CUDA-compatible Direct3D9 device available"); if (device != null) device.Dispose(); return false; } ctx = new CudaContext(cudaDevices[0], device.ComPointer, CUCtxFlags.BlockingSync, CudaContext.DirectXVersion.D3D9); deviceName.Text = "Device name: " + ctx.GetDeviceName(); // Set projection matrix SlimDX.Matrix matProj = SlimDX.Matrix.OrthoOffCenterLH(0, 1, 1, 0, 0, 1); device.SetTransform(TransformState.Projection, matProj); // Turn off D3D lighting, since we are providing our own vertex colors device.SetRenderState(RenderState.Lighting, false); //Load kernels CUmodule module = ctx.LoadModulePTX("kernel.ptx"); addForces_k = new CudaKernel("addForces_k", module, ctx); advectVelocity_k = new CudaKernel("advectVelocity_k", module, ctx); diffuseProject_k = new CudaKernel("diffuseProject_k", module, ctx); updateVelocity_k = new CudaKernel("updateVelocity_k", module, ctx); advectParticles_k = new CudaKernel("advectParticles_k", module, ctx); d3dimage.Lock(); Surface surf = device.GetBackBuffer(0, 0); d3dimage.SetBackBuffer(D3DResourceType.IDirect3DSurface9, surf.ComPointer); d3dimage.Unlock(); surf.Dispose(); //Setup the "real" frame rate counter. //The cuda counter only measures cuda runtime, not the overhead to actually //show the result via DirectX and WPF. realLastTick = Environment.TickCount; return true; }
public void Compile() { using (var ctx = new CudaContext()) { // with verbaim string @, we only have to double up double quotes: no other escaping string source = @" extern ""C"" __global__ void saxpy(float a, float *x, float *y, float *out, size_t n) { size_t tid = blockIdx.x * blockDim.x + threadIdx.x; if (tid < n) { out[tid] = a * x[tid] + y[tid]; } } "; source += Environment.NewLine; var name = "Test"; var headers = new string[0]; var includeNames = new string[0]; var compiler = new CudaRuntimeCompiler(source, name, headers, includeNames); //var compiler2 = new CudaRuntimeCompiler(source, name, headers, includeNames); // --ptxas-options=-v -keep compiler.Compile(new string[] { "-G" }); //var ptxString = compiler.GetPTXAsString(); // for debugging var ptx = compiler.GetPTX(); //compiler2.Compile(new string[] { }); var kernel = ctx.LoadKernelPTX(ptx, "kernelName"); //One kernel per cu file: //CudaKernel kernel = ctx.LoadKernel(@"path\to\kernel.ptx", "kernelname"); kernel.GridDimensions = new dim3(1, 1, 1); kernel.BlockDimensions = new dim3(16, 16); //kernel.Run() var a = new CudaDeviceVariable<double>(100); //ManagedCuda.NPP.NPPsExtensions.NPPsExtensionMethods.Sqr() //Multiple kernels per cu file: CUmodule cumodule = ctx.LoadModule(@"path\to\kernel.ptx"); CudaKernel kernel1 = new CudaKernel("kernel1", cumodule, ctx) { GridDimensions = new dim3(1, 1, 1), BlockDimensions = new dim3(16, 16), }; CudaKernel kernel2 = new CudaKernel("kernel2", cumodule, ctx) { GridDimensions = new dim3(1, 1, 1), BlockDimensions = new dim3(16, 16), }; } }
/// <summary> /// Create a new CudaArray3D and bind it to a surface reference. /// </summary> /// <param name="kernel"></param> /// <param name="surfName"></param> /// <param name="flags"></param> /// <param name="format"></param> /// <param name="width">In elements</param> /// <param name="height">In elements</param> /// <param name="depth">In elements</param> /// <param name="numChannels"></param> /// <param name="arrayFlags"></param> public static CudaArray3D BindArray(CudaKernel kernel, string surfName, CUSurfRefSetFlags flags, CUArrayFormat format, SizeT width, SizeT height, SizeT depth, CudaArray3DNumChannels numChannels, CUDAArray3DFlags arrayFlags) { CUsurfref surfref = new CUsurfref(); CUResult res = DriverAPINativeMethods.ModuleManagement.cuModuleGetSurfRef(ref surfref, kernel.CUModule, surfName); Debug.WriteLine(String.Format("{0:G}, {1}: {2}, Surface name: {3}", DateTime.Now, "cuModuleGetSurfRef", res, surfName)); if (res != CUResult.Success) throw new CudaException(res); CudaArray3D array = new CudaArray3D(format, width, height, depth, numChannels, arrayFlags); res = DriverAPINativeMethods.SurfaceReferenceManagement.cuSurfRefSetArray(surfref, array.CUArray, flags); Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuSurfRefSetArray", res)); if (res != CUResult.Success) throw new CudaException(res); return array; }
/// <summary> /// /// </summary> /// <param name="aKernel"></param> public cudaOccFuncAttributes(CudaKernel aKernel) : this(aKernel.MaxThreadsPerBlock, aKernel.Registers, aKernel.SharedMemory) { }
/////////////////////////////////////////////// // Occupancy calculation Functions // /////////////////////////////////////////////// /// <summary> /// Determine the maximum number of CTAs that can be run simultaneously per SM.<para/> /// This is equivalent to the calculation done in the CUDA Occupancy Calculator /// spreadsheet /// </summary> /// <param name="properties"></param> /// <param name="kernel"></param> /// <param name="state"></param> /// <returns></returns> public static cudaOccResult cudaOccMaxActiveBlocksPerMultiprocessor( CudaDeviceProperties properties, CudaKernel kernel, cudaOccDeviceState state) { cudaOccDeviceProp props = new cudaOccDeviceProp(properties); cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel); return cudaOccMaxActiveBlocksPerMultiprocessor(props, attributes, (int)kernel.BlockDimensions.x * (int)kernel.BlockDimensions.y * (int)kernel.BlockDimensions.z, kernel.DynamicSharedMemory, state); }
/// <summary> /// Determine the potential block size that allows maximum number of CTAs that can run on multiprocessor simultaneously /// </summary> /// <param name="properties"></param> /// <param name="kernel"></param> /// <param name="state"></param> /// <param name="blockSizeToSMem"> /// A function to convert from block size to dynamic shared memory size.<para/> /// e.g.: /// If no dynamic shared memory is used: x => 0<para/> /// If 4 bytes shared memory per thread is used: x = 4 * x</param> /// <returns>maxBlockSize</returns> public static int cudaOccMaxPotentialOccupancyBlockSize( CudaDeviceProperties properties, CudaKernel kernel, cudaOccDeviceState state, del_blockSizeToDynamicSMemSize blockSizeToSMem) { cudaOccDeviceProp props = new cudaOccDeviceProp(properties); cudaOccFuncAttributes attributes = new cudaOccFuncAttributes(kernel); return cudaOccMaxPotentialOccupancyBlockSize(props, attributes, state, blockSizeToSMem); }
static void Main(string[] args) { var assembly = Assembly.GetExecutingAssembly(); var resourceName = "simpleOccupancy.simpleOccupancy.ptx"; ctx = new CudaContext(0); string[] liste = assembly.GetManifestResourceNames(); using (Stream stream = assembly.GetManifestResourceStream(resourceName)) { kernel = ctx.LoadKernelPTX(stream, "square"); } Console.WriteLine("starting Simple Occupancy"); Console.WriteLine(); Console.WriteLine("[ Manual configuration with {0} threads per block ]", manualBlockSize); int status = test(false); if (status != 0) { Console.WriteLine("Test failed"); return; } Console.WriteLine(); Console.WriteLine("[ Automatic, occupancy-based configuration ]"); status = test(true); if (status != 0) { Console.WriteLine("Test failed"); return; } Console.WriteLine(); Console.WriteLine("Test PASSED"); }
/// <summary> /// /// </summary> /// <param name="aKernel"></param> public cudaOccFuncAttributes(CudaKernel aKernel) : this(aKernel.MaxThreadsPerBlock, aKernel.Registers, aKernel.SharedMemory, cudaOccPartitionedGCConfig.Off) { }