private float[,] KernelMul(float[,] a, float[,] b) { const int defaultBlockSize = 16; var blockDim = new dim3(Math.Min(b.Width(), defaultBlockSize), Math.Min(a.Height(), defaultBlockSize)); var gridDim = new dim3((int)Math.Ceiling(1.0 * b.Width() / blockDim.X), (int)Math.Ceiling(1.0 * a.Height() / blockDim.Y)); var ptx = LoadPtxFromResources(); // mess with driver caching JIT results // since we need to verify the entire compilation log ptx = ptx.Replace("exit;", "exit; // " + Guid.NewGuid()); using (var jitted = ptx.JitKernel(blockDim)) { // todo. that's an untidy way to invoke a kernel // since it doesn't dispose of invocation parameters // neither it is shielded against partial initialization issues var c = new float[a.Height(), b.Width()]; var c_result = jitted.Invoke(gridDim, blockDim, a.Width().In(), a.Height().In(), a.In(), b.Width().In(), b.Height().In(), b.In(), c.Width().In(), c.Height().In(), c.Out()); return (float[,])c_result; } }
public KernelResult Launch(dim3 gridDim, dim3 blockDim) { CudaDriver.Ensure(); Args.AssertNone(p => p.IsDisposed); _hasCompletedExecution.AssertFalse(); var offsets = Args.Scanbe(0, (offset, arg, _) => offset + arg.SizeInArgList); Args.Zip(offsets, (arg, offset) => arg.PassInto(this, offset)); try { nvcuda.cuFuncSetBlockShape(Function, blockDim); nvcuda.cuFuncSetSharedSize(Function, (uint)Function.SharedSizeBytes); nvcuda.cuFuncSetCacheConfig(Function, CUfunc_cache.PreferNone); nvcuda.cuParamSetSize(Function, (uint)Args.Select(p => p.SizeInArgList).Sum()); TraceBeforeLaunch(gridDim, blockDim); var wall_time = CudaProfiler.Benchmark(() => nvcuda.cuLaunchGrid(Function, gridDim)); Log.WriteLine("Function execution succeeded in {0} ({1} = 0.5 {2}s).", wall_time, Syms.Epsilon, Syms.Mu); return new KernelResult(this, wall_time); } finally { _hasCompletedExecution = true; } }
//[/cuRANDComputeValue] //[cuRANDPiEstimator] public double RunEstimation(int numSims, int threadBlockSize) { // Aim to launch around ten or more times as many blocks as there // are multiprocessors on the target device. const int blocksPerSm = 10; var numSMs = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT; // Determine how to divide the work between cores var block = new dim3(threadBlockSize); var grid = new dim3((numSims + threadBlockSize - 1) / threadBlockSize); while (grid.x > 2 * blocksPerSm * numSims) grid.x >>= 1; var n = 2 * numSims; using (var dPoints = GPUWorker.Malloc<double>(n)) using (var dResults = GPUWorker.Malloc<double>(grid.x)) { // Generate random points in unit square var curand = new CURAND(GPUWorker, CURANDInterop.curandRngType.CURAND_RNG_QUASI_SOBOL64); curand.SetQuasiRandomGeneratorDimensions(2); curand.SetGeneratorOrdering(CURANDInterop.curandOrdering.CURAND_ORDERING_QUASI_DEFAULT); curand.GenerateUniformDouble(dPoints.Ptr, new IntPtr(n)); var lp = new LaunchParam(grid, block, block.x * sizeof(uint)); GPULaunch(ComputeValue, lp, dResults.Ptr, dPoints.Ptr, numSims); var value = dResults.Gather().Sum(); return (value/numSims)*4.0; } }
internal GPGPUProperties(bool simulate = false, bool useAdvanced = true) { IsSimulated = simulate; Message = string.Empty; UseAdvanced = useAdvanced; MultiProcessorCount = 0; HighPerformanceDriver = false; SupportsDoublePrecision = true; AsynchEngineCount = 1; if (simulate) { Capability = new Version(0, 0); Name = "Simulator"; DeviceId = 0; ulong freeMem = Int32.MaxValue; try { PerformanceCounter pc = new PerformanceCounter("Memory", "Available Bytes"); freeMem = Convert.ToUInt64(pc.NextValue()); } catch (Exception ex) { Debug.WriteLine(ex.Message); #if DEBUG throw; #endif } TotalMemory = freeMem; MaxGridSize = new dim3(65536, 65536); MaxThreadsSize = new dim3(1024, 1024); MaxThreadsPerBlock = 1024; } }
public void Mult(int wA, int wB, int hC, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C) { var block = new dim3(BlockSize, BlockSize); var grid = new dim3(wB/block.x, hC/block.y); var lp = new LaunchParam(grid, block); GPULaunch(Kernel, lp, wA, wB, A, B, C); }
public static JittedKernel JitKernel(this String ptx, dim3 reqntid, HardwareIsa target) { ptx.AssertNotNull(); CudaDriver.Ensure(); var tuning = new JitTuning { Reqntid = reqntid }; return ptx.JitKernel(tuning, target); }
public static KernelResult Run(this JittedFunction function, dim3 gridDim, dim3 blockDim, params KernelArgument[] args) { function.AssertNotNull(); args = args ?? Seq.Empty<KernelArgument>().ToArray(); CudaDriver.Ensure(); return function.Run(gridDim, blockDim, (IEnumerable<KernelArgument>)args); }
public Tuning() { Maxnreg = 0; Maxntid = new dim3(0, 0, 0); Reqntid = new dim3(0, 0, 0); Minnctapersm = 0; Maxnctapersm = 0; }
public static Object Invoke(this JittedKernel kernel, dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args) { kernel.AssertNotNull(); args = args ?? Seq.Empty<KernelArgument>().ToArray(); CudaDriver.Ensure(); return kernel.Function.Invoke(gridDim, blockDim, args); }
public static KernelResult Run(this CUfunction function, dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args) { function.IsNotNull.AssertTrue(); args = args ?? Seq.Empty<KernelArgument>().ToArray(); CudaDriver.Ensure(); return new JittedFunction(function).Run(gridDim, blockDim, args); }
public static KernelResult Run(this JittedKernel kernel, dim3 gridDim, dim3 blockDim, params KernelArgument[] args) { kernel.AssertNotNull(); args = args ?? Seq.Empty<KernelArgument>().ToArray(); CudaDriver.Ensure(); return kernel.Function.Run(gridDim, blockDim, args); }
public static KernelResult Run(this JittedFunction function, dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args) { function.AssertNotNull(); args = args ?? Seq.Empty<KernelArgument>().ToArray(); CudaDriver.Ensure(); var invocation = new KernelInvocation(function, args); return invocation.Launch(gridDim, blockDim); }
public KernelThreadException(IKernel kernel, dim3? gridDim, int3? blockIdx, dim3? blockDim, int3? threadIdx, String workerThread, Exception innerException) : base(null, innerException) { Kernel = kernel.AssertNotNull(); GridDim = gridDim; BlockIdx = blockIdx; BlockDim = blockDim; ThreadIdx = threadIdx; WorkerThread = workerThread.AssertNotNull(); }
public static void TestSimpleMultiply() { for (var iter = 1; iter <= 3; ++iter) { Console.WriteLine("====> Test SimpleMultiply with Alea GPU C# AOT instance usage (#.{0}) <====", iter); var timer = Stopwatch.StartNew(); var worker = Util.Worker; Console.WriteLine("GPU: {0}", worker.Device.Name); timer.Stop(); Console.WriteLine("Step 1) Runtime setup {0} ms", timer.Elapsed.TotalMilliseconds); timer.Restart(); using (var module = new InstanceUsageAOT(GPUModuleTarget.Worker(worker))) { module.GPUForceLoad(); timer.Stop(); Console.WriteLine("Step 2+3) Compile and Load module {0} ms", timer.Elapsed.TotalMilliseconds); const int factor = 8; var a = Util.RandomMatrix(100 * factor, 200 * factor); var b = Util.RandomMatrix(200 * factor, 300 * factor); var aRows = 100 * factor; var bCols = 300 * factor; var aCols_bRows = 200 * factor; var gridDim = new dim3(Util.Divup(bCols, TileSize), Util.Divup(aRows, TileSize)); var blockDim = new dim3(TileSize, TileSize); var lp = new LaunchParam(gridDim, blockDim); using (var devA = worker.Malloc(a)) using (var devB = worker.Malloc(b)) using (var devC = worker.Malloc<float>(aRows * bCols)) { timer.Restart(); module.GPULaunch(module.SimpleMultiplyKernel, lp, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows); worker.Synchronize(); timer.Stop(); Console.WriteLine("Kernel launch first time {0} ms", timer.Elapsed.TotalMilliseconds); const int repetitions = 50; timer.Restart(); for (var i = 0; i < repetitions; ++i) { module.GPULaunch(module.SimpleMultiplyKernel, lp, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows); } worker.Synchronize(); timer.Stop(); Console.WriteLine("Kernel launch average time {0} ms", (timer.Elapsed.TotalMilliseconds / (float)repetitions)); var c = devC.Gather(); Util.VerifyResult(a, b, c, aRows, bCols, aCols_bRows); } } } }
internal GridCaps(CudaDevice device) { Device = device; MaxThreadsInBlock = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxThreadsPerBlock, device); var maxBlockX = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxBlockDimX, device); var maxBlockY = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxBlockDimY, device); var maxBlockZ = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxBlockDimZ, device); MaxBlockDim = new dim3(maxBlockX, maxBlockY, maxBlockZ); var maxGridX = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxGridDimX, device); var maxGridY = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxGridDimY, device); var maxGridZ = nvcuda.cuDeviceGetAttribute(CUdevice_attribute.MaxGridDimZ, device); MaxGridDim = new dim3(maxGridX, maxGridY, maxGridZ); }
private void TraceBeforeLaunch(dim3 gridDim, dim3 blockDim) { Log.EnsureBlankLine(); Log.WriteLine("Launching function {0}...", Function); Log.WriteLine("Grid is configured as {2}: blockdim is {0}, griddim is {1}", blockDim.ToString().Slice(4), gridDim.ToString().Slice(4), new dim3(blockDim.X * gridDim.X, blockDim.Y * gridDim.Y, blockDim.Z * gridDim.Z).ToString().Slice(4)); var log = new List<Object>(); var offset = 0; var sizeInVRAM = 0; foreach (var args in Args) { var value = args.Get("_value"); log.Add(Tuple.Create("+" + offset.ToString("0000"), args.Direction, value, args.SizeInVRAM, args.SizeInArgList)); offset += args.SizeInArgList; sizeInVRAM += args.SizeInVRAM; } log.Add(Tuple.Create(offset.ToString("0000"), "", "", sizeInVRAM, offset)); var formatted = log.Select(t => t.TupleItems().Select(v => v.ToString()).ToArray()).ToArray(); Func<int, String, int, String> pad = (i, text, max) => i < 2 ? text.PadRight(max) : i == 2 ? text.PadRight((int)(max + 1)) : text; var padded = formatted.Select(entry => entry.Select((part, i) => pad(i, part, formatted.Max(entry1 => entry1[i].Length)))).ToArray(); var maxLength = 0; padded.ForEach((entry, i) => { if (i == log.Count() - 1 && maxLength != 0) { Log.WriteLine(" " + "*".Repeat(maxLength - 4)); } var line = String.Format(" {0} {1} {2} ({3} bytes in VRAM)", entry.SkipLast(1).ToArray()); Log.WriteLine(line); maxLength = Math.Max(line.Length, maxLength); }); }
void advectParticles(uint vbo, CudaPitchedDeviceVariable<cData> v, int dx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); cuda_vbo_resource.MapAllResources(); CUdeviceptr p = cuda_vbo_resource[0].GetMappedPointer(); advectParticles_k.GridDimensions = grid; advectParticles_k.BlockDimensions = tids; advectParticles_k.Run(p, v.DevicePointer, dx, dy, dt, TILEY / TIDSY, tPitch); cuda_vbo_resource.UnmapAllResources(); }
void advectVelocity(CudaPitchedDeviceVariable<cData> v, CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int pdx, int dy, float dt, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); updateTexture(v, DIM * float2.SizeOf, DIM, tPitch); advectVelocity_k.GridDimensions = grid; advectVelocity_k.BlockDimensions = tids; advectVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, dt, TILEY / TIDSY); }
public void Run(dim3 gridDim, dim3 blockDim, Object kernel_instance) { // todo. due to a bug in Libcuda/nvcuda we cannot specify compilation Target // when we use default target (TargetFromContext) everything works fine tho // so let's stick to this solution for now (Cfg.Target == CudaVersions.HardwareIsa).AssertTrue(); using (var jitted_ptx = Ptx.JitKernel(blockDim)) // using (var jitted_ptx = Ptx.JitKernel(blockDim, Cfg.Target)) { using (var kernel_args = _memcpyHostToDevice(kernel_instance)) { var kernel_result = jitted_ptx.Run(gridDim, blockDim, kernel_args); _memcpyDeviceToHost(kernel_result, kernel_instance); } } }
/// <summary> /// Вызов и исполнение одной элементарной функции по имени функции /// </summary> /// <param name="function"></param> public static void Execute(IEnumerable <string> functions, int input, int output) { Debug.Assert(input != 0 && output != 0); CudafyModule km = CudafyTranslator.Cudafy(); GPGPU gpu = CudafyHost.GetDevice(); gpu.LoadModule(km); int[,] devA = gpu.Allocate(_a); int[,] devB = gpu.Allocate(_b); int[] devC = gpu.Allocate(_c); int[] devD = gpu.Allocate(_d); int[] devE = gpu.Allocate(E); if ((input & (int)Register.A) != 0) { gpu.CopyToDevice(_a, devA); } if ((input & (int)Register.B) != 0) { gpu.CopyToDevice(_b, devB); } if ((input & (int)Register.C) != 0) { gpu.CopyToDevice(_c, devC); } if ((input & (int)Register.D) != 0) { gpu.CopyToDevice(_d, devD); } if ((input & (int)Register.E) != 0) { gpu.CopyToDevice(E, devE); } int rows = _a.GetLength(0); int columns = _a.GetLength(1); dim3 gridSize = Math.Min(15, (int)Math.Pow(rows * columns, 0.33333333333)); dim3 blockSize = Math.Min(15, (int)Math.Pow(rows * columns, 0.33333333333)); foreach (string function in functions) { gpu.Launch(gridSize, blockSize, function, devA, devB, devC, devD, devE); } if ((output & (int)Register.A) != 0) { gpu.CopyFromDevice(devA, _a); } if ((output & (int)Register.B) != 0) { gpu.CopyFromDevice(devB, _b); } if ((output & (int)Register.C) != 0) { gpu.CopyFromDevice(devC, _c); } if ((output & (int)Register.D) != 0) { gpu.CopyFromDevice(devD, _d); } if ((output & (int)Register.E) != 0) { gpu.CopyFromDevice(devE, E); } // free the memory allocated on the GPU gpu.FreeAll(); }
public virtual void SetupExecution(dim3 blockDimensions, dim3 gridDimensions) { m_kernel.BlockDimensions = blockDimensions; m_kernel.GridDimensions = gridDimensions; }
private void Generate(CudaKernel kernelPositionWeight, int width, int height, int depth) { int count = width * height * depth; int widthD = width - 1; int heightD = height - 1; int depthD = depth - 1; int countDecremented = widthD * heightD * depthD; dim3 blockDimensions = new dim3(8, 8, 8); dim3 gridDimensions = new dim3((int)Math.Ceiling(width / 8.0), (int)Math.Ceiling(height / 8.0), (int)Math.Ceiling(depth / 8.0)); dim3 gridDimensionsDecremented = new dim3((int)Math.Ceiling(widthD / 8.0), (int)Math.Ceiling(heightD / 8.0), (int)Math.Ceiling(depthD / 8.0)); CUDANoiseCube noiseCube = new CUDANoiseCube(); CudaArray3D noiseArray = noiseCube.GenerateUniformArray(16, 16, 16); CudaTextureArray3D noiseTexture = new CudaTextureArray3D(kernelPositionWeight, "noiseTexture", CUAddressMode.Wrap, CUFilterMode.Linear, CUTexRefSetFlags.NormalizedCoordinates, noiseArray); CudaDeviceVariable<Voxel> voxelsDev = new CudaDeviceVariable<Voxel>(count); kernelPositionWeight.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelPositionWeight, gridDimensions); kernelPositionWeight.Run(voxelsDev.DevicePointer, width, height, depth); kernelNormalAmbient.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelNormalAmbient, gridDimensions); kernelNormalAmbient.Run(voxelsDev.DevicePointer, width, height, depth, container.Settings.AmbientRayWidth, container.Settings.AmbientSamplesCount); int nearestW = NearestPowerOfTwo(widthD); int nearestH = NearestPowerOfTwo(heightD); int nearestD = NearestPowerOfTwo(depthD); int nearestCount = nearestW * nearestH * nearestD; CudaDeviceVariable<int> trisCountDevice = new CudaDeviceVariable<int>(nearestCount); trisCountDevice.Memset(0); CudaDeviceVariable<int> offsetsDev = new CudaDeviceVariable<int>(countDecremented); kernelMarchingCubesCases.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesCases, gridDimensionsDecremented); kernelMarchingCubesCases.Run(voxelsDev.DevicePointer, width, height, depth, offsetsDev.DevicePointer, trisCountDevice.DevicePointer, nearestW, nearestH, nearestD); CudaDeviceVariable<int> prefixSumsDev = prefixScan.PrefixSumArray(trisCountDevice, nearestCount); int lastTrisCount = 0; trisCountDevice.CopyToHost(ref lastTrisCount, (nearestCount - 1) * sizeof(int)); int lastPrefixSum = 0; prefixSumsDev.CopyToHost(ref lastPrefixSum, (nearestCount - 1) * sizeof(int)); int totalVerticesCount = (lastTrisCount + lastPrefixSum) * 3; if (totalVerticesCount > 0) { if (container.Geometry != null) container.Geometry.Dispose(); container.VertexCount = totalVerticesCount; container.Geometry = new Buffer(graphicsDevice, new BufferDescription() { BindFlags = BindFlags.VertexBuffer, CpuAccessFlags = CpuAccessFlags.None, OptionFlags = ResourceOptionFlags.None, SizeInBytes = Marshal.SizeOf(typeof(VoxelMeshVertex)) * totalVerticesCount, Usage = ResourceUsage.Default }); CudaDirectXInteropResource directResource = new CudaDirectXInteropResource(container.Geometry.ComPointer, CUGraphicsRegisterFlags.None, CudaContext.DirectXVersion.D3D11, CUGraphicsMapResourceFlags.None); kernelMarchingCubesVertices.BlockDimensions = blockDimensions; typeof(CudaKernel).GetField("_gridDim", BindingFlags.Instance | BindingFlags.NonPublic).SetValue(kernelMarchingCubesVertices, gridDimensionsDecremented); directResource.Map(); kernelMarchingCubesVertices.Run(directResource.GetMappedPointer(), voxelsDev.DevicePointer, prefixSumsDev.DevicePointer, offsetsDev.DevicePointer, width, height, depth, nearestW, nearestH, nearestD); directResource.UnMap(); directResource.Dispose(); } else { container.VertexCount = 0; if (container.Geometry != null) container.Geometry.Dispose(); } noiseCube.Dispose(); prefixSumsDev.Dispose(); trisCountDevice.Dispose(); offsetsDev.Dispose(); noiseArray.Dispose(); noiseTexture.Dispose(); voxelsDev.Dispose(); }
static void Main(string[] args) { const int nx = 2048; const int ny = 2048; // shifts applied to x and y data const int x_shift = 5; const int y_shift = 7; ShrQATest.shrQAStart(args); if ((nx%TILE_DIM != 0) || (ny%TILE_DIM != 0)) { Console.Write("nx and ny must be multiples of TILE_DIM\n"); ShrQATest.shrQAFinishExit(args, ShrQATest.eQAstatus.QA_WAIVED); } // execution configuration parameters dim3 grid = new dim3(nx/TILE_DIM, ny/TILE_DIM, 1); dim3 threads = new dim3(TILE_DIM, TILE_DIM, 1); // This will pick the best possible CUDA capable device int devID = findCudaDevice(args); //Load Kernel image from resources string resName; if (IntPtr.Size == 8) resName = "simplePitchLinearTexture_x64.ptx"; else resName = "simplePitchLinearTexture.ptx"; string resNamespace = "simplePitchLinearTexture"; string resource = resNamespace + "." + resName; Stream stream = Assembly.GetExecutingAssembly().GetManifestResourceStream(resource); if (stream == null) throw new ArgumentException("Kernel not found in resources."); byte[] kernels = new byte[stream.Length]; int bytesToRead = (int)stream.Length; while (bytesToRead > 0) { bytesToRead -= stream.Read(kernels, (int)stream.Position, bytesToRead); } CudaKernel PLKernel = ctx.LoadKernelPTX(kernels, "shiftPitchLinear"); CudaKernel ArrayKernel = ctx.LoadKernelPTX(kernels, "shiftArray"); CudaStopWatch stopwatch = new CudaStopWatch(); // ---------------------------------- // Host allocation and initialization // ---------------------------------- float[] h_idata = new float[nx * ny]; float[] h_odata = new float[nx * ny]; float[] gold = new float[nx * ny]; for (int i = 0; i < nx * ny; ++i) h_idata[i] = (float)i; // ------------------------ // Device memory allocation // ------------------------ // Pitch linear input data CudaPitchedDeviceVariable<float> d_idataPL = new CudaPitchedDeviceVariable<float>(nx, ny); // Array input data CudaArray2D d_idataArray = new CudaArray2D(CUArrayFormat.Float, nx, ny, CudaArray2DNumChannels.One); // Pitch linear output data CudaPitchedDeviceVariable<float> d_odata = new CudaPitchedDeviceVariable<float>(nx, ny); // ------------------------ // copy host data to device // ------------------------ // Pitch linear d_idataPL.CopyToDevice(h_idata); // Array d_idataArray.CopyFromHostToThis<float>(h_idata); // ---------------------- // Bind texture to memory // ---------------------- // Pitch linear CudaTextureLinearPitched2D<float> texRefPL = new CudaTextureLinearPitched2D<float>(PLKernel, "texRefPL", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, CUArrayFormat.Float, d_idataPL); CudaTextureArray2D texRefArray = new CudaTextureArray2D(ArrayKernel, "texRefArray", CUAddressMode.Wrap, CUFilterMode.Point, CUTexRefSetFlags.NormalizedCoordinates, d_idataArray); // --------------------- // reference calculation // --------------------- for (int j = 0; j < ny; j++) { int jshift = (j + y_shift) % ny; for (int i = 0; i < nx; i++) { int ishift = (i + x_shift) % nx; gold[j * nx + i] = h_idata[jshift * nx + ishift]; } } // ---------------- // shiftPitchLinear // ---------------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); PLKernel.BlockDimensions = threads; PLKernel.GridDimensions = grid; stopwatch.Start(); for (int i=0; i < NUM_REPS; i++) { PLKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch/sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timePL = stopwatch.GetElapsedTime(); // check results d_odata.CopyToHost(h_odata); bool res = cutComparef(gold, h_odata); bool success = true; if (res == false) { Console.Write("*** shiftPitchLinear failed ***\n"); success = false; } // ---------- // shiftArray // ---------- ctx.ClearMemory(d_odata.DevicePointer, 0, d_odata.TotalSizeInBytes); ArrayKernel.BlockDimensions = threads; ArrayKernel.GridDimensions = grid; stopwatch.Start(); for (int i=0; i < NUM_REPS; i++) { ArrayKernel.Run(d_odata.DevicePointer, (int)(d_odata.Pitch/sizeof(float)), nx, ny, x_shift, y_shift); } stopwatch.Stop(); stopwatch.StopEvent.Synchronize(); float timeArray = stopwatch.GetElapsedTime(); // check results d_odata.CopyToHost(h_odata); res = cutComparef(gold, h_odata); if (res == false) { Console.Write("*** shiftArray failed ***\n"); success = false; } float bandwidthPL = 2.0f*1000.0f*nx*ny*sizeof(float)/(1e+9f)/(timePL/NUM_REPS); float bandwidthArray = 2.0f*1000.0f*nx*ny*sizeof(float)/(1e+9f)/(timeArray/NUM_REPS); Console.Write("\nBandwidth (GB/s) for pitch linear: {0}; for array: {1}\n", bandwidthPL, bandwidthArray); float fetchRatePL = nx*ny/1e+6f/(timePL/(1000.0f*NUM_REPS)); float fetchRateArray = nx*ny/1e+6f/(timeArray/(1000.0f*NUM_REPS)); Console.Write("\nTexture fetch rate (Mpix/s) for pitch linear: {0}; for array: {1}\n\n", fetchRatePL, fetchRateArray); // cleanup texRefPL.Dispose(); texRefArray.Dispose(); d_idataPL.Dispose(); d_idataArray.Dispose(); d_odata.Dispose(); stopwatch.Dispose(); ctx.Dispose(); ShrQATest.shrQAFinishExit(args, (success == true) ? ShrQATest.eQAstatus.QA_PASSED : ShrQATest.eQAstatus.QA_FAILED); }
void addForces(CudaPitchedDeviceVariable<float2> v, int dx, int dy, int spx, int spy, float fx, float fy, int r, SizeT tPitch) { dim3 tids = new dim3((uint)(2 * r + 1), (uint)(2 * r + 1), 1); addForces_k.GridDimensions = new dim3(1); addForces_k.BlockDimensions = tids; addForces_k.Run(v.DevicePointer, dx, dy, spx, spy, fx, fy, r, tPitch); }
private void generateKernels(string forwardName, string backName, string clrName, string activeName, dim3 kernelSize) { forward = ctx.LoadKernel("kernel.ptx", forwardName); forward.GridDimensions = new dim3(size.x, size.y, size.z); forward.BlockDimensions = kernelSize; back = ctx.LoadKernel("kernel.ptx", backName); back.GridDimensions = new dim3(size.x, size.y, size.z); back.BlockDimensions = kernelSize; clear = ctx.LoadKernel("kernel.ptx", activeName); clear.GridDimensions = new dim3(size.x, size.y, size.z); activate = ctx.LoadKernel("kernel.ptx", activeName); activate.GridDimensions = new dim3(size.x, size.y, size.z); }
public Grid(dim3 gridDim, dim3 blockDim) { GridDim = gridDim; BlockDim = blockDim; }
public float3[] GetPointsGPU(int NumPoints) { int BlockSize = 512; if (NumPoints % BlockSize != 0) { throw new Exception("NumPoints must be divisible by " + BlockSize.ToString()); } int[] TriangleCounts = new int[GridCount + 1]; var Maxima = new float3[GridCount]; var Minima = new float3[GridCount]; TriangleCounts[0] = 0; for (int i = 0; i < GridCount; i++) { int LocalCount = TriangleCounts[i] + (int)Domains[i].TriangleCount; if (Domains[i].TriangleCount > BlockSize) { throw new Exception("STL File must have no more than " + BlockSize.ToString() + " Triangles"); } TriangleCounts[i + 1] = LocalCount; Minima[i] = STLReader.ToFloat3(Domains[i].Extrema.Min); Maxima[i] = STLReader.ToFloat3(Domains[i].Extrema.Max); } var Triangles = new TriangleSTL[TriangleCounts[GridCount]]; int id = 0; for (int i = 0; i < GridCount; i++) { for (int j = 0; j < TriangleCounts[i]; j++) { var LocalTri = Domains[i].Triangles[j]; Triangles[id] = new TriangleSTL(LocalTri); id++; } } var ctx = new CudaContext(1); var DeviceInfo = ctx.GetDeviceInfo(); var d_Triangles = new CudaDeviceVariable <TriangleSTL>(Triangles.Length); var d_TriangleCounts = new CudaDeviceVariable <int>(GridCount); var d_Minima = new CudaDeviceVariable <float3>(GridCount); var d_Maxima = new CudaDeviceVariable <float3>(GridCount); var d_Points = new CudaDeviceVariable <float3>(GridCount * NumPoints); var h_Points = new float3[GridCount * NumPoints]; var rng = new Random(0); // use a sequence that is repeatable over and over again for (int i = 0; i < GridCount * NumPoints; i++) { h_Points[i].x = (float)rng.NextDouble(); h_Points[i].y = (float)rng.NextDouble(); h_Points[i].z = (float)rng.NextDouble(); } int ctr = 0; for (int i = 0; i < GridCount; i++) { for (int j = 0; j < NumPoints; j++) { h_Points[ctr].x = Minima[i].x + h_Points[ctr].x * (Maxima[i].x - Minima[i].x); h_Points[ctr].y = Minima[i].y + h_Points[ctr].y * (Maxima[i].y - Minima[i].y); h_Points[ctr].z = Minima[i].z + h_Points[ctr].z * (Maxima[i].z - Minima[i].z); ctr++; } } d_Points = h_Points; d_Triangles = Triangles; d_TriangleCounts = TriangleCounts; d_Minima = Minima; d_Maxima = Maxima; // copy over to host // TODO generate grid on GPU instead of CPU var PointInPolygonKernel = ctx.LoadKernelPTX("PointInPolygon.ptx", "PointInPolygon"); var BlockDim = new dim3(BlockSize, 1, 1); var GridDim = new dim3(GridCount, 1, 1); PointInPolygonKernel.BlockDimensions = BlockDim; PointInPolygonKernel.GridDimensions = GridDim; PointInPolygonKernel.Run(GridCount, NumPoints, d_TriangleCounts.DevicePointer, d_Triangles.DevicePointer, d_Maxima.DevicePointer, d_Minima.DevicePointer, d_Points.DevicePointer); h_Points = d_Points; return(h_Points); // TODO Fix this to remove bad points }
private CudaKernel InitializeGridsAndThreads(string kernalName, Matrix a) { int maxThreads = Math.Min(this.maxThreadPerBlockDim, a.Row); dim3 threads = a.Column == 1 ? new dim3(maxThreads, 1) : new dim3(maxThreads, maxThreads); dim3 blocks = new dim3((a.Row + maxThreads - 1) / maxThreads, (a.Column + maxThreads - 1) / maxThreads); CudaKernel kernel = new CudaKernel(kernalName, this.cuModule, this.cudaContext) { GridDimensions = blocks, BlockDimensions = threads }; return kernel; }
/// <summary> /// Invokes the specified context. /// </summary> /// <param name="context">The context.</param> /// <param name="cudaContext">The cuda context.</param> /// <param name="kernelName">Name of the kernel.</param> /// <param name="grid">The grid.</param> /// <param name="block">The block.</param> /// <param name="smemSize">Size of the smem.</param> /// <param name="stream">The stream.</param> /// <param name="args">The arguments.</param> private void Invoke(TSCudaContext context, CudaContext cudaContext, string kernelName, dim3 grid, dim3 block, uint smemSize, CUstream stream, params object[] args) { var ptx = GetPtx(context.Compiler); var kernel = context.KernelCache.Get(cudaContext, ptx, kernelName); kernel.GridDimensions = grid; kernel.BlockDimensions = block; kernel.DynamicSharedMemory = smemSize; kernel.RunAsync(stream, args); }
void diffuseProject(CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int dy, float dt, float visc, SizeT tPitch) { // Forward FFT planr2c.Exec(vx.DevicePointer); planr2c.Exec(vy.DevicePointer); dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); diffuseProject_k.GridDimensions = grid; diffuseProject_k.BlockDimensions = tids; diffuseProject_k.Run(vx.DevicePointer, vy.DevicePointer, dx, dy, dt, visc, TILEY / TIDSY); // Inverse FFT planc2r.Exec(vx.DevicePointer); planc2r.Exec(vy.DevicePointer); }
/// <summary> /// NOTE: Compute Capability 3.5 and later only. Dynamic parallelism. Call from a single thread. /// Not supported by emulator. /// </summary> /// <param name="gridSize">Size of grid.</param> /// <param name="blockSize">Size of block.</param> /// <param name="functionName">Name of function to launch.</param> /// <param name="args">Arguments.</param> public static int Launch(this GThread thread, dim3 gridSize, dim3 blockSize, string functionName, params object[] args) { ThrowNotSupported(); return 0; }
void updateVelocity(CudaPitchedDeviceVariable<cData> v, CudaDeviceVariable<cData> vx, CudaDeviceVariable<cData> vy, int dx, int pdx, int dy, SizeT tPitch) { dim3 grid = new dim3((uint)((dx / TILEX) + (!(dx % TILEX != 0) ? 0 : 1)), (uint)((dy / TILEY) + (!(dy % TILEY != 0) ? 0 : 1)), 1); dim3 tids = new dim3(TIDSX, TIDSY, 1); updateVelocity_k.GridDimensions = grid; updateVelocity_k.BlockDimensions = tids; updateVelocity_k.Run(v.DevicePointer, vx.DevicePointer, vy.DevicePointer, dx, pdx, dy, TILEY / TIDSY, tPitch); }
public static void InvokeReduce(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string kernelName, dim3 grid, dim3 block, uint smemSize, ApplySpecialization spec, params object[] args) { ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args); var kernel = context.KernelCache.Get(cudaContext, ptx, kernelName); kernel.GridDimensions = grid; kernel.BlockDimensions = block; kernel.DynamicSharedMemory = smemSize; kernel.Run(args); }
public static extern CUResult cuMemcpyDtoH_v2(ref dim3 dstHost, CUdeviceptr srcDevice, SizeT ByteCount);
public void SetDims(dim3 gridDim, dim3 blockDim) { GridDim = gridDim; BlockDim = blockDim; }
public static TestingStructure[] CorrectColour(System.Drawing.Color rgb, double X, double Y, double Z) { //rgb = System.Drawing.Color.FromArgb(69, 77, 217); //X = 0.0630982813175294; //Y = 0.616476271122916; //Z = 0.667048468232457; //cuda intializer CudafyModule km = CudafyModule.TryDeserialize(); if (km == null || !km.TryVerifyChecksums()) { // km = CudafyTranslator.Cudafy((typeof(ForeGroundStrucuture)), (typeof(BackGroundStrucuture)), typeof(Color)); km = CudafyTranslator.Cudafy((typeof(ProfileStrucuture)), (typeof(ForeGroundStrucuture)), (typeof(BackGroundStrucuture)), (typeof(SampleStructure)), (typeof(TestingStructure)), typeof(snake)); km.TrySerialize(); } CudafyTranslator.GenerateDebug = true; // cuda or emulator GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, CudafyModes.DeviceId); //GPGPU gpu = CudafyHost.GetDevice(eGPUType.Emulator); Console.WriteLine("Running snake correction using {0}", gpu.GetDeviceProperties(false).Name); gpu.LoadModule(km); TestingStructure[] distance_CPU = new TestingStructure[1]; // allocate memory on the GPU for the bitmap (same size as ptr) #region DataTable profile = new DataTable(); try { // add the csv bin file using (GenericParserAdapter parser = new GenericParserAdapter(@"C:\lev\STColorCorrection\Data\PROFILE\p3700.csv")) { System.Data.DataSet dsResult = parser.GetDataSet(); profile = dsResult.Tables[0]; } } catch (Exception ex) { Console.WriteLine(ex); } #endregion // allocate temp memory, initialize it, copy to constant memory on the GPU // L 0-21 A 0-41 B 0-45 ProfileStrucuture[, ,] profiles_CPU = new ProfileStrucuture[21, 41, 45]; ForeGroundStrucuture[] foregorungRGB_CPU = new ForeGroundStrucuture[1]; BackGroundStrucuture[] BackgroundXYZ_CPU = new BackGroundStrucuture[1]; SampleStructure[,] samples_CPU = new SampleStructure[1, 6]; //profile inicialization #region for (int indexL = 0; indexL < 21; indexL++) { for (int indexA = 0; indexA < 41; indexA++) { for (int indexB = 0; indexB < 45; indexB++) { profiles_CPU[indexL, indexA, indexB].L = indexL; profiles_CPU[indexL, indexA, indexB].A = indexA; profiles_CPU[indexL, indexA, indexB].B = indexB; //profiles_CPU[indexL, indexA, indexB].Given_R = 0; //profiles_CPU[indexL, indexA, indexB].Given_G = 0; //profiles_CPU[indexL, indexA, indexB].Given_B = 0; profiles_CPU[indexL, indexA, indexB].ML = 0; profiles_CPU[indexL, indexA, indexB].MA = 0; profiles_CPU[indexL, indexA, indexB].MB = 0; profiles_CPU[indexL, indexA, indexB].MX = 0; profiles_CPU[indexL, indexA, indexB].MY = 0; profiles_CPU[indexL, indexA, indexB].MZ = 0; profiles_CPU[indexL, indexA, indexB].distance = -1.0; profiles_CPU[indexL, indexA, indexB].weight = -1.0; profiles_CPU[indexL, indexA, indexB].isempty = TRUE; profiles_CPU[indexL, indexA, indexB].isMoreAccurateThanOrigin = FALSE; } } } int lvalue, avalue, bvalue; try { for (int i = 1; i < profile.Rows.Count; i++) { lvalue = Convert.ToInt32(profile.Rows[i][0].ToString()); avalue = Convert.ToInt32(profile.Rows[i][1].ToString()); bvalue = Convert.ToInt32(profile.Rows[i][2].ToString()); lvalue = (int)(lvalue * 0.2); avalue = (int)(avalue * 0.2) + 20; bvalue = (int)(bvalue * 0.2) + 22; profiles_CPU[lvalue, avalue, bvalue].L = lvalue; profiles_CPU[lvalue, avalue, bvalue].A = avalue; profiles_CPU[lvalue, avalue, bvalue].B = bvalue; //profiles_CPU[lvalue, avalue, bvalue].Given_R = (double)Convert.ToByte(profile.Rows[i][9].ToString()); //profiles_CPU[lvalue, avalue, bvalue].Given_G = (double)Convert.ToByte(profile.Rows[i][10].ToString()); //profiles_CPU[lvalue, avalue, bvalue].Given_B = (double)Convert.ToByte(profile.Rows[i][11].ToString()); profiles_CPU[lvalue, avalue, bvalue].ML = (double)Convert.ToDouble(profile.Rows[i][3].ToString()); profiles_CPU[lvalue, avalue, bvalue].MA = (double)Convert.ToDouble(profile.Rows[i][4].ToString()); profiles_CPU[lvalue, avalue, bvalue].MB = (double)Convert.ToDouble(profile.Rows[i][5].ToString()); profiles_CPU[lvalue, avalue, bvalue].MX = (double)Convert.ToDouble(profile.Rows[i][6].ToString()); profiles_CPU[lvalue, avalue, bvalue].MY = (double)Convert.ToDouble(profile.Rows[i][7].ToString()); profiles_CPU[lvalue, avalue, bvalue].MZ = (double)Convert.ToDouble(profile.Rows[i][8].ToString()); profiles_CPU[lvalue, avalue, bvalue].isempty = FALSE; } } catch (Exception ex) { Console.WriteLine(ex); } #endregion //grab the colors ProfileStrucuture[, ,] profile_GPU = gpu.CopyToDevice(profiles_CPU); SampleStructure[,] samples_GPU = gpu.CopyToDevice(samples_CPU); Point3D background = new Point3D(X, Y, Z); //foreground and background image inicialization #region try { for (int i = 0; i < 1; i++) { foregorungRGB_CPU[i].R = rgb.R; foregorungRGB_CPU[i].G = rgb.G; foregorungRGB_CPU[i].B = rgb.B; BackgroundXYZ_CPU[i].X = background.X; BackgroundXYZ_CPU[i].Y = background.Y; BackgroundXYZ_CPU[i].Z = background.Z; } } catch (Exception ex) { Console.WriteLine(ex); } #endregion //begin execution // capture the start time gpu.StartTimer(); ForeGroundStrucuture[] foregorungRGB_GPU = gpu.CopyToDevice(foregorungRGB_CPU); BackGroundStrucuture[] BackgroundXYZ_GPU = gpu.CopyToDevice(BackgroundXYZ_CPU); //out put TestingStructure[] distance_GPU = gpu.Allocate(distance_CPU); // generate a bitmap from our sphere data //Image size: 1024 x 768 //dim3 grids = new dim3(1024 / 16, 768 / 16); //dim3 threads = new dim3(16, 16); dim3 grids = new dim3(1, 1); dim3 threads = new dim3(1, 1); //quick_correct //gpu.Launch(grids, threads, ((Action<GThread, ProfileStrucuture[, ,], ForeGroundStrucuture[], BackGroundStrucuture[], ProfileStrucuture[], SampleStructure[,]>)QuickCorr), profile_GPU, foregorungRGB_GPU, BackgroundXYZ_GPU, distance_GPU, samples_GPU); //quick correct - testing gpu.Launch(grids, threads, ((Action <GThread, ProfileStrucuture[, , ], ForeGroundStrucuture[], BackGroundStrucuture[], TestingStructure[], SampleStructure[, ]>)QuickCorr), profile_GPU, foregorungRGB_GPU, BackgroundXYZ_GPU, distance_GPU, samples_GPU); // copy our bitmap back from the GPU for display gpu.CopyFromDevice(distance_GPU, distance_CPU); // get stop time, and display the timing results double elapsedTime = gpu.StopTimer(); distance_CPU[0].execution_time = elapsedTime; Console.WriteLine("Time to generate: {0} ms", elapsedTime); gpu.Free(foregorungRGB_GPU); gpu.Free(BackgroundXYZ_GPU); gpu.Free(distance_GPU); gpu.FreeAll(); return(distance_CPU); }
public KernelResult Run(dim3 gridDim, dim3 blockDim, IEnumerable<KernelArgument> args) { var invocation = new KernelInvocation(this, args); return invocation.Launch(gridDim, blockDim); }