/// <summary>
/// Runs <c>Kernel</c> on the default GPU with a 1-D grid covering <c>n * n</c> threads
/// and copies the device result back into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="mSquaredDistances">Host buffer uploaded before the launch and overwritten with the result.</param>
/// <param name="mCoordinates">Host coordinates uploaded to the device.</param>
/// <param name="c">Forwarded to the kernel unchanged.</param>
/// <param name="n">Forwarded to the kernel; also determines the grid size.</param>
public static void Cuda(
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n)
{
    const int threadsPerBlock = 128;
    var device = Gpu.Default;

    using (var devDistances = device.AllocateDevice(mSquaredDistances))
    using (var devCoordinates = device.AllocateDevice(mCoordinates))
    {
        // The stopwatch starts after the uploads, so only launch + synchronize are timed.
        var stopwatch = Stopwatch.StartNew();

        var blockCount = Util.DivUp(n * n, threadsPerBlock);
        var launch = new LaunchParam(blockCount, threadsPerBlock);

        device.Launch(Kernel, launch, devDistances.Ptr, devCoordinates.Ptr, c, n);
        device.Synchronize();

        Util.PrintPerformance(stopwatch, "SquaredDistance.Cuda", n, c, n);
        Gpu.Copy(devDistances, mSquaredDistances);
    }
}
/// <summary>
/// Launches <c>Kernel</c> with square thread blocks of <c>BlockSize x BlockSize</c>.
/// </summary>
/// <param name="wA">Forwarded to the kernel.</param>
/// <param name="wB">Forwarded to the kernel; determines the grid's x extent.</param>
/// <param name="hC">Determines the grid's y extent (not forwarded to the kernel).</param>
/// <param name="A">Device pointer passed to the kernel.</param>
/// <param name="B">Device pointer passed to the kernel.</param>
/// <param name="C">Device pointer passed to the kernel.</param>
public void Mult(int wA, int wB, int hC, deviceptr<double> A, deviceptr<double> B, deviceptr<double> C)
{
    var threads = new dim3(BlockSize, BlockSize);

    // NOTE(review): integer division truncates, so wB and hC are presumably expected to be
    // multiples of BlockSize - confirm against the kernel before changing this.
    var blocksX = wB / threads.x;
    var blocksY = hC / threads.y;

    var launch = new LaunchParam(new dim3(blocksX, blocksY), threads);
    GPULaunch(Kernel, launch, wA, wB, A, B, C);
}
/// <summary>
/// Allocates every host and device buffer needed for an image of the given size so that
/// later calls do not have to reallocate.
/// </summary>
/// <param name="size">Image dimensions; all per-pixel buffers hold Width * Height elements.</param>
private AdaptiveLBP(Size size)
{
    this.size = size;
    int pixelCount = size.Width * size.Height;

    // initialize data structures to avoid reallocating with every call
    hist = new int[numSubuniformPatterns * numVarBins];
    hist2 = new int[numSubuniformPatterns * numVarBins];
    lbpImageGPU = worker.Malloc<short>(pixelCount);
    varImageGPU = worker.Malloc<short>(pixelCount);
    histGPU = worker.Malloc<int>(hist.Length);
    floatImageGPU = worker.Malloc<float>(pixelCount);

    // precompute the subuniform bin for each LBP pattern, and push it to the GPU.
    // There are 2^numNeighbors patterns; an integer shift is used instead of
    // (short)Math.Pow(2, n) because the short cast overflows once numNeighbors reaches 15.
    subuniformBins = new short[1 << numNeighbors];
    for (int pattern = 0; pattern < subuniformBins.Length; pattern++)
    {
        subuniformBins[pattern] = GetPatternNum(pattern);
    }
    subuniformBinsGPU = worker.Malloc(subuniformBins);

    // unit-circle offsets of the sampling neighbors
    neighborCoordinateX = new float[numNeighbors];
    neighborCoordinateY = new float[numNeighbors];
    for (int i = 0; i < numNeighbors; i++)
    {
        double angle = 2.0 * PI * (double)i / (double)numNeighbors;
        neighborCoordinateX[i] = (float)Math.Cos(angle);
        neighborCoordinateY[i] = (float)Math.Sin(angle);
    }
    neighborCoordinateXGPU = worker.Malloc(neighborCoordinateX);
    neighborCoordinateYGPU = worker.Malloc(neighborCoordinateY);
    varBinsGPU = worker.Malloc(varBins);

    // initialize CUDA parameters: 8x8 thread blocks, grid rounded up to cover the image
    var blockDims = new dim3(8, 8);
    var gridDims = new dim3(Common.divup(size.Width, blockDims.x), Common.divup(size.Height, blockDims.y));
    lp = new LaunchParam(gridDims, blockDims);

    // create filters (one Laplacian-of-Gaussian per scale, flattened and uploaded)
    for (int i = 0; i < numScales; i++)
    {
        float[,] filter = LaplacianOfGaussian.Generate(i + 1);
        filters[i] = Utils.Flatten(filter);
        filtersGPU[i] = worker.Malloc(filters[i]);
        filterSizes[i] = (filter.GetLength(0) - 1) / 2;
    }

    // allocate space for scale space images and upload the table of their device pointers
    deviceptr<float>[] tempPointers = new deviceptr<float>[numScales];
    for (int i = 0; i < numScales; i++)
    {
        scaledImages[i] = worker.Malloc<float>(pixelCount);
        tempPointers[i] = scaledImages[i].Ptr;
    }
    scaledImagePointers = worker.Malloc(tempPointers);
    pixelScaleImage = worker.Malloc<short>(pixelCount);
}
/// <summary>
/// Multiplies two matrices on the default Alea GPU by converting every element to its
/// GPU struct form, launching <c>multiplicationKernel</c>, and converting the result back.
/// </summary>
/// <param name="left">Left operand.</param>
/// <param name="right">Right operand.</param>
/// <returns>A new matrix with <c>left.Rows</c> rows and <c>right.Columns</c> columns.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when <c>left.Width</c> differs from <c>right.Height</c>.
/// </exception>
public static Matrix<FieldType> GpuMultiply<FieldType, GpuStructType>(Matrix<FieldType> left, Matrix<FieldType> right)
    where FieldType : Field<FieldType>, IGpuCompatibleField<FieldType, GpuStructType>, new()
    where GpuStructType : struct
{
    if (left.Width != right.Height)
    {
        throw new InvalidOperationException("Matrices of incompatible sizes can't be multiplied.");
    }

    var structManager = new FieldType().GetDefaultGpuStructManager();

    // Host-side staging arrays in GPU struct representation.
    var product = new GpuStructType[left.Rows, right.Columns];
    var lhs = new GpuStructType[left.Rows, left.Columns];
    var rhs = new GpuStructType[right.Rows, right.Columns];

    product.AssignAll(structManager.GetStructDefaultValue());
    lhs.AssignAll(ind => structManager.ToStruct(left[ind[0], ind[1]]));
    rhs.AssignAll(ind => structManager.ToStruct(right[ind[0], ind[1]]));

    var device = Alea.Gpu.Default;

    // One thread per output element, packed into the largest blocks the device supports.
    int totalThreads = left.Rows * right.Columns;
    int threadsPerBlock = device.Device.Attributes.MaxThreadsPerBlock;
    int blocksPerGrid = (int)Math.Ceiling((double)totalThreads / threadsPerBlock);
    var launch = new LaunchParam(blocksPerGrid, threadsPerBlock);

    device.Launch(
        multiplicationKernel,
        launch,
        lhs,
        rhs,
        product,
        structManager.GetStructAddition(),
        structManager.GetStructMultiplication());

    var converted = new FieldType[product.GetLength(0), product.GetLength(1)];
    converted.AssignAll(ind => structManager.ToClass(product[ind[0], ind[1]]));

    return new Matrix<FieldType>(converted);
}
//[/GenericReduceSPUK]

///[genericReduceScalarProdUse]
/// <summary>
/// Uploads both arrays, runs the <c>Upsweep</c> kernel over the planned block ranges and,
/// when there is more than one range, a second single-block kernel that combines the
/// per-range totals. Returns the first element of the totals buffer.
/// </summary>
/// <param name="values1">First input array; its length determines the problem size.</param>
/// <param name="values2">Second input array.</param>
public T Apply(T[] values1, T[] values2)
{
    var length = values1.Length;
    var smCount = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;

    var partition = _plan.BlockRanges(smCount, length);
    var blockRanges = partition.Item1;
    var rangeCount = partition.Item2;

    var upsweepLaunch = new LaunchParam(rangeCount, _plan.NumThreads);
    var reduceLaunch = new LaunchParam(1, _plan.NumThreadsReduction);

    using (var devRanges = GPUWorker.Malloc(blockRanges))
    using (var devTotals = GPUWorker.Malloc<T>(rangeCount))
    using (var devLeft = GPUWorker.Malloc(values1))
    using (var devRight = GPUWorker.Malloc(values2))
    {
        GPUWorker.EvalAction(() =>
        {
            // Launch range reduction kernel to calculate the totals per range.
            GPULaunch(Upsweep, upsweepLaunch, devLeft.Ptr, devRight.Ptr, devRanges.Ptr, devTotals.Ptr);

            // A single range already holds the final value; otherwise the block sums
            // need to be aggregated as well.
            if (rangeCount <= 1)
            {
                return;
            }

            GPULaunch(_reduce.ReduceRangeTotals, reduceLaunch, rangeCount, devTotals.Ptr);
        });

        var totals = devTotals.Gather();
        return totals[0];
    }
}
//[/cuRANDComputeValue]

//[cuRANDPiEstimator]
/// <summary>
/// Generates <c>2 * numSims</c> quasi-random doubles with cuRAND, runs <c>ComputeValue</c>
/// over them, sums the per-block results on the host and returns
/// <c>(sum / numSims) * 4.0</c>.
/// </summary>
/// <param name="numSims">Number of simulated points.</param>
/// <param name="threadBlockSize">Threads per block.</param>
public double RunEstimation(int numSims, int threadBlockSize)
{
    // Aim to launch around ten or more times as many blocks as there
    // are multiprocessors on the target device.
    const int blocksPerSm = 10;
    var numSMs = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;

    // Determine how to divide the work between cores
    var block = new dim3(threadBlockSize);
    var grid = new dim3((numSims + threadBlockSize - 1) / threadBlockSize);

    // Halve the grid until it is at most 2 * blocksPerSm blocks per multiprocessor.
    // The bound previously used numSims, which can never be exceeded by grid.x, so the
    // loop was dead code and numSMs was unused.
    // NOTE(review): a smaller grid relies on ComputeValue striding over all numSims points
    // with gridDim.x * blockDim.x - confirm against the kernel.
    while (grid.x > 2 * blocksPerSm * numSMs)
    {
        grid.x >>= 1;
    }

    var n = 2 * numSims;

    using (var dPoints = GPUWorker.Malloc<double>(n))
    using (var dResults = GPUWorker.Malloc<double>(grid.x))
    {
        // Generate random points in unit square
        var curand = new CURAND(GPUWorker, CURANDInterop.curandRngType.CURAND_RNG_QUASI_SOBOL64);
        curand.SetQuasiRandomGeneratorDimensions(2);
        curand.SetGeneratorOrdering(CURANDInterop.curandOrdering.CURAND_ORDERING_QUASI_DEFAULT);
        curand.GenerateUniformDouble(dPoints.Ptr, new IntPtr(n));

        // One uint of dynamic shared memory per thread is requested for the kernel.
        var lp = new LaunchParam(grid, block, block.x * sizeof(uint));
        GPULaunch(ComputeValue, lp, dResults.Ptr, dPoints.Ptr, numSims);

        // One partial result per block; finish the sum on the host.
        var value = dResults.Gather().Sum();
        return (value / numSims) * 4.0;
    }
}
/// <summary>
/// Uploads the four host arrays, runs <c>CudaKernel</c> with 32x8 thread blocks on a grid
/// rounded up to cover <c>n</c> by <c>m</c>, and copies the result back into
/// <paramref name="mIntraReturn"/>.
/// </summary>
/// <param name="mIntraReturn">Host buffer uploaded before the launch and overwritten with the result.</param>
/// <param name="vClose">Host input uploaded to the device.</param>
/// <param name="vIsAlive">Host input uploaded to the device.</param>
/// <param name="vIsValidDay">Host input uploaded to the device.</param>
/// <param name="m">Forwarded to the kernel; determines the grid's y extent.</param>
/// <param name="n">Forwarded to the kernel; determines the grid's x extent.</param>
public static void Cuda(
    Real[] mIntraReturn,
    Real[] vClose,
    Real[] vIsAlive,
    Real[] vIsValidDay,
    int m,
    int n)
{
    const int blockWidth = 32;
    const int blockHeight = 8;
    var device = Gpu.Default;

    using (var devIntraReturn = device.AllocateDevice(mIntraReturn))
    using (var devClose = device.AllocateDevice(vClose))
    using (var devIsAlive = device.AllocateDevice(vIsAlive))
    using (var devIsValidDay = device.AllocateDevice(vIsValidDay))
    {
        // Timing covers launch + synchronize only; uploads happened above.
        var stopwatch = Stopwatch.StartNew();

        var blocksX = Util.DivUp(n, blockWidth);
        var blocksY = Util.DivUp(m, blockHeight);
        var launch = new LaunchParam(new dim3(blocksX, blocksY), new dim3(blockWidth, blockHeight));

        device.Launch(
            CudaKernel,
            launch,
            devIntraReturn.Ptr,
            devClose.Ptr,
            devIsAlive.Ptr,
            devIsValidDay.Ptr,
            m,
            n);
        device.Synchronize();

        Util.PrintPerformance(stopwatch, "IntraReturn.Cuda", 5, m, n);
        Gpu.Copy(devIntraReturn, mIntraReturn);
    }
}
//[/GenericScanDownsweepKernel]

/// <summary>
/// Runs the three scan stages in order (upsweep, single-block scan of the range totals,
/// downsweep) and returns the gathered output buffer.
/// </summary>
/// <param name="input">Values to scan.</param>
/// <param name="inclusive">Passed to the downsweep kernel as 1 when true, 0 when false.</param>
public T[] Apply(T[] input, bool inclusive)
{
    var length = input.Length;
    var smCount = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;

    var partition = Plan.BlockRanges(smCount, length);
    var blockRanges = partition.Item1;
    var rangeCount = partition.Item2;

    var upsweepLaunch = new LaunchParam(rangeCount, Plan.NumThreads);
    var reduceLaunch = new LaunchParam(1, Plan.NumThreadsReduction);
    var downsweepLaunch = new LaunchParam(rangeCount, Plan.NumThreads);

    // The kernel takes the flag as an int.
    int inclusiveFlag;
    if (inclusive)
    {
        inclusiveFlag = 1;
    }
    else
    {
        inclusiveFlag = 0;
    }

    using (var devRanges = GPUWorker.Malloc(blockRanges))
    using (var devTotals = GPUWorker.Malloc<T>(rangeCount + 1))
    using (var devInput = GPUWorker.Malloc(input))
    using (var devOutput = GPUWorker.Malloc(input))
    {
        _reduceModule.Upsweep(upsweepLaunch, devInput.Ptr, devRanges.Ptr, devTotals.Ptr);
        GPULaunch(ScanReduce, reduceLaunch, rangeCount, devTotals.Ptr);
        GPULaunch(Downsweep, downsweepLaunch, devInput.Ptr, devOutput.Ptr, devTotals.Ptr, devRanges.Ptr, inclusiveFlag);

        return devOutput.Gather();
    }
}
/// <summary>
/// Allocates an n x n pitched device buffer, runs <paramref name="kernel"/> on a
/// square grid of 256-thread blocks, reports the timing under <paramref name="name"/>,
/// and copies the 2-D result into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="gpu">Device used for allocation, launch and synchronization.</param>
/// <param name="mSquaredDistances">Host destination for the n x n result.</param>
/// <param name="mCoordinates">Host coordinates uploaded to the device.</param>
/// <param name="c">Passed to the kernel as a compile-time constant.</param>
/// <param name="n">Side length of the result; also determines the grid size.</param>
/// <param name="name">Label passed to the performance printer.</param>
/// <param name="kernel">Kernel to launch.</param>
private static void AleaOptimisedImpl(
    Gpu gpu,
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<deviceptr<Real>, deviceptr<Real>, Constant<int>, Constant<int>, int, int> kernel)
{
    const int threadsPerBlock = 256;

    using var devDistances = gpu.AllocateDevice<Real>(n, n);
    using var devCoordinates = gpu.AllocateDevice(mCoordinates);

    var stopwatch = Stopwatch.StartNew();

    var blocksPerSide = Util.DivUp(n, threadsPerBlock);
    var launch = new LaunchParam(
        new dim3(blocksPerSide, blocksPerSide, 1),
        new dim3(threadsPerBlock, 1, 1));

    // The kernel needs the row pitch (in elements) of the 2-D allocation.
    var pitchInElements = devDistances.PitchInElements.ToInt32();

    gpu.Launch(
        kernel,
        launch,
        devDistances.Ptr,
        devCoordinates.Ptr,
        Gpu.Constant(threadsPerBlock),
        Gpu.Constant(c),
        n,
        pitchInElements);
    gpu.Synchronize();

    Util.PrintPerformance(stopwatch, name, n, c, n);
    Gpu.Copy2D(devDistances, mSquaredDistances, n, n);
}
/// <summary>
/// Launches <c>flatten_kernel</c> on the default GPU with a grid sized for
/// <c>spatial * batch * layers</c> elements.
/// </summary>
public static void flatten_ongpu(float[] x, int spatial, int layers, int batch, int forward, float[] output)
{
    var elementCount = spatial * batch * layers;

    var grid = CudaUtils.cuda_gridsize(elementCount);
    var block = new dim3(CudaUtils.BlockSize);
    var launch = new LaunchParam(grid, block);

    Gpu.Default.Launch(flatten_kernel, launch, elementCount, x, spatial, layers, batch, forward, output);
}
/// <summary>
/// Launches <c>reorg_kernel</c> on the default GPU with a grid sized for
/// <c>w * h * c * batch</c> elements.
/// </summary>
public static void reorg_ongpu(float[] x, int w, int h, int c, int batch, int stride, int forward, float[] output)
{
    var elementCount = w * h * c * batch;

    var grid = CudaUtils.cuda_gridsize(elementCount);
    var block = new dim3(CudaUtils.BlockSize);
    var launch = new LaunchParam(grid, block);

    Gpu.Default.Launch(reorg_kernel, launch, elementCount, x, w, h, c, batch, stride, forward, output);
}
/// <summary>
/// Launches <c>normalize_kernel</c> on the default GPU with a grid sized for
/// <c>batch * filters * spatial</c> elements.
/// </summary>
public static void normalize_gpu(float[] x, float[] mean, float[] variance, int batch, int filters, int spatial)
{
    var elementCount = batch * filters * spatial;

    var grid = CudaUtils.cuda_gridsize(elementCount);
    var block = new dim3(CudaUtils.BlockSize);
    var launch = new LaunchParam(grid, block);

    Gpu.Default.Launch(normalize_kernel, launch, elementCount, x, mean, variance, batch, filters, spatial);
}
/// <summary>
/// Launches <c>fast_variance_delta_kernel</c> on the default GPU with
/// <c>CudaUtils.BlockSize</c> threads per block and a grid of
/// <c>CudaUtils.cuda_gridsize(filters)</c>.
/// </summary>
public static void fast_variance_delta_gpu(float[] x, float[] delta, float[] mean, float[] variance, int batch, int filters, int spatial, float[] varianceDelta)
{
    // NOTE(review): the grid comes from cuda_gridsize(filters), not filters itself. If the
    // kernel expects one block per filter this launches too few blocks - confirm against
    // the kernel.
    var grid = CudaUtils.cuda_gridsize(filters);
    var block = new dim3(CudaUtils.BlockSize);
    var launch = new LaunchParam(grid, block);

    Gpu.Default.Launch(fast_variance_delta_kernel, launch, x, delta, mean, variance, batch, filters, spatial, varianceDelta);
}
// Fixed Block Size!
/// <summary>
/// Applies the convolution filter to <paramref name="image"/> on the default GPU using a
/// fixed launch of 256 blocks x 256 threads; each thread strides over the pixels by the
/// total thread count.
/// </summary>
/// <param name="image">Source bitmap.</param>
/// <param name="filter">Supplies the filter matrix, factor and offset.</param>
/// <returns>A new image of the same width and height built from the filtered pixels.</returns>
internal static Image Render3(Bitmap image, ConvolutionFilter filter)
{
    var gpu = Gpu.Default;
    var width = image.Width;
    var array = BitmapUtility.ToColorArray(image);

    // Copied into locals so the kernel lambda captures plain values.
    var mFilter = filter.Filter;
    var mFactor = filter.Factor;
    var mOffset = filter.Offset;

    var inputMemory = gpu.ArrayGetMemory(array, true, false);
    var inputDevPtr = new deviceptr<ColorRaw>(inputMemory.Handle);
    var resultLength = array.Length;

    // The result buffer was never released before; dispose it once the pixels have been
    // copied back to the host.
    using (var resultMemory = gpu.AllocateDevice<ColorRaw>(resultLength))
    {
        var resultDevPtr = new deviceptr<ColorRaw>(resultMemory.Handle);
        var lp = new LaunchParam(256, 256);

        gpu.Launch(() =>
        {
            var i = blockDim.x * blockIdx.x + threadIdx.x;
            while (i < resultLength)
            {
                ComputeEdgeDetectFilter0AtOffsetNapron(inputDevPtr, resultDevPtr, resultLength, mFilter, mFactor, mOffset, i, width);
                i += blockDim.x * gridDim.x;
            }
        }, lp);

        return BitmapUtility.FromColorArray(Gpu.CopyToHost(resultMemory), image.Width, image.Height);
    }
}
/// <summary>
/// Allocates an n x n pitched device buffer, runs <paramref name="kernel"/> on a square
/// grid of 128-thread blocks with <c>2 * c * 128 * sizeof(Real)</c> bytes of dynamic shared
/// memory, reports the timing under <paramref name="name"/>, and copies the 2-D result
/// into <paramref name="mSquaredDistances"/>.
/// </summary>
/// <param name="mSquaredDistances">Host destination for the n x n result.</param>
/// <param name="mCoordinates">Host coordinates uploaded to the device.</param>
/// <param name="c">Converted by <paramref name="numCoordGetter"/> before being passed to the kernel.</param>
/// <param name="n">Side length of the result; also determines the grid size.</param>
/// <param name="name">Label passed to the performance printer.</param>
/// <param name="kernel">Kernel to launch.</param>
/// <param name="numCoordGetter">Maps <paramref name="c"/> to the kernel's argument type.</param>
private static void CudaOptimisedImpl<TInt>(
    Real[] mSquaredDistances,
    Real[] mCoordinates,
    int c,
    int n,
    string name,
    Action<deviceptr<float>, deviceptr<float>, TInt, int, int> kernel,
    Func<int, TInt> numCoordGetter)
{
    const int threadsPerBlock = 128;
    var device = Gpu.Default;

    using (var devDistances = device.AllocateDevice<Real>(n, n))
    using (var devCoordinates = device.AllocateDevice(mCoordinates))
    {
        var stopwatch = Stopwatch.StartNew();

        var blocksPerSide = Util.DivUp(n, threadsPerBlock);
        var sharedBytes = 2 * c * threadsPerBlock * sizeof(Real);
        var launch = new LaunchParam(
            new dim3(blocksPerSide, blocksPerSide, 1),
            new dim3(threadsPerBlock, 1, 1),
            sharedBytes);

        // The kernel needs the row pitch (in elements) of the 2-D allocation.
        var pitchInElements = devDistances.PitchInElements.ToInt32();

        device.Launch(kernel, launch, devDistances.Ptr, devCoordinates.Ptr, numCoordGetter(c), n, pitchInElements);
        device.Synchronize();

        Util.PrintPerformance(stopwatch, name, n, c, n);
        Gpu.Copy2D(devDistances, mSquaredDistances, n, n);
    }
}
/// <summary>
/// Launches <c>scale_bias_kernel</c> on the default GPU with a 3-D grid of
/// (ceil(size / BlockSize), n, batch) blocks and <c>CudaUtils.BlockSize</c> threads per block.
/// </summary>
public static void scale_bias_gpu(float[] output, float[] biases, int batch, int n, int size)
{
    // Rounded-up number of blocks needed to cover `size` along x.
    var blocksX = (size - 1) / CudaUtils.BlockSize + 1;

    var grid = new dim3(blocksX, n, batch);
    var block = new dim3(CudaUtils.BlockSize, 1, 1);
    var launch = new LaunchParam(grid, block);

    Gpu.Default.Launch(scale_bias_kernel, launch, output, biases, n, size);
}
/// <summary>
/// Launches <c>Kernel</c> with one block per system and <paramref name="n"/> threads per
/// block, requesting <c>9 * n * sizeof(double)</c> bytes of dynamic shared memory.
/// </summary>
/// <param name="numSystems">Number of blocks to launch.</param>
/// <param name="n">Threads per block; also forwarded to the kernel.</param>
/// <param name="dl">Device pointer forwarded to the kernel.</param>
/// <param name="dd">Device pointer forwarded to the kernel.</param>
/// <param name="du">Device pointer forwarded to the kernel.</param>
/// <param name="db">Device pointer forwarded to the kernel.</param>
/// <param name="dx">Device pointer forwarded to the kernel.</param>
public void Apply(int numSystems, int n, deviceptr<double> dl, deviceptr<double> dd, deviceptr<double> du, deviceptr<double> db, deviceptr<double> dx)
{
    var sharedBytes = 9 * n * sizeof(double);
    var launch = new LaunchParam(numSystems, n, sharedBytes);

    this.GPULaunch(this.Kernel, launch, n, dl, dd, du, db, dx);
}
/// <summary>
/// Launches <c>softmax_kernel</c> on the default GPU with a grid sized for
/// <paramref name="groups"/> and <c>CudaUtils.BlockSize</c> threads per block.
/// </summary>
public static void softmax_gpu(float[] input, int n, int offset, int groups, float temp, float[] output, int inputStart = 0, int outputStart = 0)
{
    // The kernel's "inputs" and "batch" arguments are n and groups respectively.
    var grid = CudaUtils.cuda_gridsize(groups);
    var block = new dim3(CudaUtils.BlockSize);
    var launch = new LaunchParam(grid, block);

    Gpu.Default.Launch(softmax_kernel, launch, n, offset, groups, input, temp, output, inputStart, outputStart);
}
//[/transformKernel]

//[transformGPUDevice]
/// <summary>
/// Launches <c>Kernel</c> with 256-thread blocks. The grid is the smaller of
/// 16 blocks per multiprocessor and the number of blocks needed to cover
/// <paramref name="n"/> elements.
/// </summary>
/// <param name="n">Element count forwarded to the kernel.</param>
/// <param name="x">Device pointer forwarded to the kernel.</param>
/// <param name="y">Device pointer forwarded to the kernel.</param>
/// <param name="z">Device pointer forwarded to the kernel.</param>
public void Apply(int n, deviceptr<T> x, deviceptr<T> y, deviceptr<T> z)
{
    const int threadsPerBlock = 256;

    var smCount = this.GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;
    var gridCap = 16 * smCount;
    var blocksNeeded = Common.divup(n, threadsPerBlock);
    var blockCount = Math.Min(gridCap, blocksNeeded);

    GPULaunch(Kernel, new LaunchParam(blockCount, threadsPerBlock), n, x, y, z);
}
/// <summary>
/// Launches <c>RlmAleaGpu.KernelCache</c> with the given grid/block sizes and builds the
/// cache result from the kernel output.
/// </summary>
/// <param name="lparam1">First argument of the <c>LaunchParam</c> constructor.</param>
/// <param name="lparam2">Second argument of the <c>LaunchParam</c> constructor.</param>
/// <returns>The value returned by <c>FindBestSolutionAndBuildCache</c>.</returns>
protected override RlmCacheDataArray LaunchKernel(long[] rneurons, double[][] inputs, double[] from, double[] to, bool[] rneuronsCache, double[] fromCache, double[] toCache, int lparam1, int lparam2)
{
    // One output slot per rneuron.
    var kernelOutput = new long[rneurons.Length];
    var launch = new LaunchParam(lparam1, lparam2);

    gpu.Launch(
        RlmAleaGpu.KernelCache,
        launch,
        rneurons,
        inputs,
        kernelOutput,
        from,
        to,
        rneuronsCache,
        fromCache,
        toCache);

    return FindBestSolutionAndBuildCache(rneurons, kernelOutput, inputs, rneuronsCache);
}
/// <summary>
/// Launches <c>RlmAleaGpu.Kernel</c> with the given grid/block sizes and passes the kernel
/// output to <c>FindBestSolution</c>.
/// </summary>
/// <param name="lparam1">First argument of the <c>LaunchParam</c> constructor.</param>
/// <param name="lparam2">Second argument of the <c>LaunchParam</c> constructor.</param>
protected override void LaunchKernel(long[] rneurons, double[][] inputs, double[] from, double[] to, int lparam1, int lparam2)
{
    // One output slot per rneuron.
    var kernelOutput = new long[rneurons.Length];
    var launch = new LaunchParam(lparam1, lparam2);

    gpu.Launch(RlmAleaGpu.Kernel, launch, rneurons, inputs, kernelOutput, from, to);

    FindBestSolution(kernelOutput);
}
/// <summary>
/// Computes the attention softmax over the encoder hidden states and, on a float GPU
/// context, the attention state as the softmax-weighted sum of those hidden states.
/// </summary>
/// <param name="executor">Supplies the tensors and the execution context.</param>
/// <exception cref="NotImplementedException">
/// Thrown (after the softmax tensor has been assigned) when the context is not a GPU
/// context or <c>T</c> is not <c>float</c>.
/// </exception>
public override void Forward(Executor executor)
{
    var wh = executor.GetTensor(Wh);
    var wd = executor.GetTensor(Wd);
    var v = executor.GetTensor(V);
    var h = executor.GetTensor(EncoderHiddenStates).Reshape(SeqLength * Batch, -1);
    var d = executor.GetTensor(DecoderHiddenState);

    var whh = Dot(h, wh); // [n*b, EncoderHiddenSize] * [EncoderHiddenSize, AttentionDim] = [n*b, AttentionDim]
    var wdd = Dot(d, wd); // [b, DecoderHiddenSize] * [DecoderHiddenSize, AttentionDim] = [b, AttentionDim]
    var whd = Tanh(whh + wdd); // broadcasting to [n*b, AttentionDim]
    var u = Dot(whd, v); // [n*b, AttentionDim] * [AttentionDim] = [n*b]
    var expu = Exp(u.Reshape(SeqLength, Batch));
    var softmax = expu / ReduceSum(expu, true, 0); // [n, b]

    executor.AssignTensor(Softmax, softmax);

    var ctx = executor.Context;
    if (ctx.Type != ContextType.Gpu || typeof(T) != typeof(float))
    {
        throw new NotImplementedException();
    }

    var stream = ctx.ToGpuContext().Stream;
    var hPtr = h.Buffer.Ptr.Reinterpret<float>();
    var softmaxPtr = executor.GetTensor(Softmax).Buffer.Ptr.Reinterpret<float>();
    var attentionState = executor.GetTensor(AttentionState).Buffer.Ptr.Reinterpret<float>();

    // Copy the sizes into locals so the kernel captures plain values rather than `this`.
    var batchSize = Batch;
    var seqLength = SeqLength;
    var encoderHiddenSize = EncoderHiddenSize;

    // Round the grid up so sizes that are not multiples of the tile are fully covered
    // (truncating division launched zero blocks for sizes below 32). The bounds check in
    // the kernel discards the surplus threads.
    const int tile = 32;
    var lp = new LaunchParam(
        new dim3((batchSize + tile - 1) / tile, (encoderHiddenSize + tile - 1) / tile, 1),
        new dim3(tile, tile));

    stream.Launch(() =>
    {
        var batch = blockIdx.x * blockDim.x + threadIdx.x;
        var hidden = blockIdx.y * blockDim.y + threadIdx.y;

        if (batch < batchSize && hidden < encoderHiddenSize)
        {
            var sum = 0.0f;
            for (var i = 0; i < seqLength; ++i)
            {
                var alpha = softmaxPtr[i * batchSize + batch];

                // h is [n*b, EncoderHiddenSize]; row (i * b + batch), column hidden.
                // NOTE(review): assumes h's buffer is densely packed row-major - confirm.
                sum += alpha * hPtr[(i * batchSize + batch) * encoderHiddenSize + hidden];
            }

            attentionState[batch * encoderHiddenSize + hidden] = sum;
        }
    }, lp);
}
/// <summary>
/// Fetches the input data and launches <c>Kernel</c> on the default GPU with 16 blocks of
/// 256 threads, passing host arrays directly to the launch. The result array is not
/// returned or inspected.
/// </summary>
public static void RunGpuWithAutomaticMemoryManagement()
{
    var length = GetData(out var x, out var y);
    var output = new float[length];

    var launch = new LaunchParam(16, 256);
    var device = Gpu.Default;

    device.Launch(Kernel, launch, output, x, y);
}
/// <summary>
/// Runs the SimpleMultiply benchmark three times. Each pass prints the runtime setup
/// time, the module compile/load time, the first kernel launch time and the average over
/// 50 further launches, then verifies the gathered product.
/// </summary>
public static void TestSimpleMultiply()
{
    const int passes = 3;

    for (var pass = 1; pass <= passes; ++pass)
    {
        Console.WriteLine("====> Test SimpleMultiply with Alea GPU C# AOT instance usage (#.{0}) <====", pass);

        var stopwatch = Stopwatch.StartNew();
        var worker = Util.Worker;
        Console.WriteLine("GPU: {0}", worker.Device.Name);
        stopwatch.Stop();
        Console.WriteLine("Step 1) Runtime setup {0} ms", stopwatch.Elapsed.TotalMilliseconds);

        stopwatch.Restart();
        using (var module = new InstanceUsageAOT(GPUModuleTarget.Worker(worker)))
        {
            module.GPUForceLoad();
            stopwatch.Stop();
            Console.WriteLine("Step 2+3) Compile and Load module {0} ms", stopwatch.Elapsed.TotalMilliseconds);

            // Matrix dimensions: A is aRows x aCols_bRows, B is aCols_bRows x bCols.
            const int factor = 8;
            var aRows = 100 * factor;
            var aCols_bRows = 200 * factor;
            var bCols = 300 * factor;

            var a = Util.RandomMatrix(aRows, aCols_bRows);
            var b = Util.RandomMatrix(aCols_bRows, bCols);

            // One TileSize x TileSize block per output tile, grid rounded up.
            var blockDim = new dim3(TileSize, TileSize);
            var gridDim = new dim3(Util.Divup(bCols, TileSize), Util.Divup(aRows, TileSize));
            var launch = new LaunchParam(gridDim, blockDim);

            using (var devA = worker.Malloc(a))
            using (var devB = worker.Malloc(b))
            using (var devC = worker.Malloc<float>(aRows * bCols))
            {
                // First launch is timed on its own.
                stopwatch.Restart();
                module.GPULaunch(module.SimpleMultiplyKernel, launch, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows);
                worker.Synchronize();
                stopwatch.Stop();
                Console.WriteLine("Kernel launch first time {0} ms", stopwatch.Elapsed.TotalMilliseconds);

                // Steady-state average over repeated launches.
                const int repetitions = 50;
                stopwatch.Restart();
                var remaining = repetitions;
                while (remaining > 0)
                {
                    module.GPULaunch(module.SimpleMultiplyKernel, launch, devA.Ptr, devB.Ptr, devC.Ptr, aRows, bCols, aCols_bRows);
                    --remaining;
                }
                worker.Synchronize();
                stopwatch.Stop();
                Console.WriteLine("Kernel launch average time {0} ms", (stopwatch.Elapsed.TotalMilliseconds / (float)repetitions));

                var product = devC.Gather();
                Util.VerifyResult(a, b, product, aRows, bCols, aCols_bRows);
            }
        }
    }
}
/// <summary>
/// Launches <c>Kernel</c> through <c>MyGPULaunch</c> with one block of 32 threads and
/// prints element 4 of the gathered output.
/// </summary>
public void Test()
{
    const int threadCount = 32;

    using (var devOutputs = GPUWorker.Malloc<int>(threadCount))
    {
        var launch = new LaunchParam(1, threadCount);
        this.MyGPULaunch(Kernel, launch, devOutputs.Ptr, 1, 3);

        var hostOutputs = devOutputs.Gather();
        Console.WriteLine("{0}", hostOutputs[4]);
    }
}
//[/DynamicStartKernel]

//[DynamicPrepareAndLaunchKernel]
/// <summary>
/// Launches <c>IntegrateBodies</c> with a caller-supplied block size, requesting one
/// <c>float4</c> of dynamic shared memory per thread.
/// </summary>
/// <param name="blockSize">Threads per block; also the divisor for the block and tile counts.</param>
public void IntegrateNbodySystem(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping, int blockSize)
{
    // Block count and tile count are both numBodies / blockSize, rounded up.
    var blockCount = Alea.CUDA.Utilities.Common.divup(numBodies, blockSize);
    var tileCount = Alea.CUDA.Utilities.Common.divup(numBodies, blockSize);

    var sharedBytes = blockSize * Operators.SizeOf<float4>();
    var launch = new LaunchParam(blockCount, blockSize, sharedBytes);

    GPULaunch(IntegrateBodies, launch, newPos, oldPos, vel, numBodies, deltaTime, softeningSquared, damping, tileCount);
}
//[/StaticStartKernel]

//[StaticPrepareAndLaunchKernel]
/// <summary>
/// Launches <c>IntegrateBodies</c> using the instance's <c>_blockSize</c>; no dynamic
/// shared memory is requested.
/// </summary>
public void IntegrateNbodySystem(deviceptr<float4> newPos, deviceptr<float4> oldPos, deviceptr<float4> vel, int numBodies, float deltaTime, float softeningSquared, float damping)
{
    // Block count and tile count are both numBodies / _blockSize, rounded up.
    var blockCount = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize);
    var tileCount = Alea.CUDA.Utilities.Common.divup(numBodies, _blockSize);

    var launch = new LaunchParam(blockCount, _blockSize);

    GPULaunch(IntegrateBodies, launch, newPos, oldPos, vel, numBodies, deltaTime, softeningSquared, damping, tileCount);
}
/// <summary>
/// Accumulates the output gradient into the weight gradient. On a GPU context with fully
/// packed layouts a single-block kernel is used (float only); otherwise the gradient is
/// computed through <c>TakeGrad</c>.
/// </summary>
/// <param name="executor">Supplies tensors, gradients and the execution context.</param>
/// <exception cref="NotImplementedException">
/// Thrown on the GPU path when <c>T</c> is not <c>float</c> (after the weight gradient has
/// been reset to zero).
/// </exception>
public override void Backward(Executor executor)
{
    var ctx = executor.Context;
    var indices = executor.GetTensor(Indices);
    var gradout = executor.GetGradient(Output);

    // for performance fix.
    var useGpuKernel = ctx.Type == ContextType.Gpu
                       && gradout.Layout.IsInnerChangeMostFullyPacked
                       && indices.Layout.IsInnerChangeMostFullyPacked;

    if (!useGpuKernel)
    {
        executor.AssignGradient(Weights, TakeGrad(indices, gradout, EmbedSize));
        return;
    }

    var rowWidth = EmbedDim;
    var rowCount = (int)indices.Shape.Length;
    var threadsPerBlock = 256;

    // first set all to 0
    executor.AssignGradient(Weights, Fill(executor.GetTensor(Weights).Shape, ScalarOps.Conv<T>(0.0)));
    var weightGrad = executor.GetGradient(Weights);

    // then use a 1 block kernel to update it, cause usually the batch size is not huge,
    // but the embedsize is huge!
    var stream = ctx.ToGpuContext().Stream;
    var indexPtr = indices.Buffer.Ptr;

    if (typeof(T) != typeof(float))
    {
        throw new NotImplementedException();
    }

    var gradOutPtr = gradout.Buffer.Ptr.Reinterpret<float>();
    var weightGradPtr = weightGrad.Buffer.Ptr.Reinterpret<float>();

    // the following kernel is for 1 block, so there is no need for synchronization;
    // this could be further optimized.
    var launch = new LaunchParam(1, threadsPerBlock);

    stream.Launch(() =>
    {
        // Rows are processed one after another; threads split the columns of each row.
        for (var i = 0; i < rowCount; ++i)
        {
            var row = indexPtr[i];
            for (var k = threadIdx.x; k < rowWidth; k += blockDim.x)
            {
                weightGradPtr[row * rowWidth + k] += gradOutPtr[i * rowWidth + k];
            }
        }
    }, launch);
}
/// <summary>
/// Launches <c>mKernel</c> on two arrays holding 0..999, asserts that the output equals
/// their element-wise sum, and shows element 600 in a message box.
/// </summary>
private void mRunKernel( )
{
    const int length = 1000;

    var lhs = Enumerable.Range(0, length).ToArray( );
    var rhs = Enumerable.Range(0, length).ToArray( );
    var expected = lhs.Zip(rhs, (x, y) => x + y);
    var actual = new int[length];

    // Use 4 blocks with 32 threads per block
    var launch = new LaunchParam(4, 32);
    this.voGpu.Launch(this.mKernel, launch, actual, lhs, rhs);

    Assert.That(actual, Is.EqualTo(expected));

    var message = "Result[600] = " + actual[600].ToString( );
    MessageBox.Show(message);
}
/// <summary>
/// Private Constructor. Pre-allocates the GPU buffers used for brightness correction,
/// loads a saved correction factor from "correctionFactor.dat" when that file exists, sets
/// up the CUDA launch parameters and connects to the camera provider. Any exception is
/// reported through <c>OnError</c> instead of being propagated.
/// </summary>
Camera()
{
    try
    {
        const int pixelCount = 640 * 640;

        // pre-allocate and initialize the variables and data structures for brightness correction
        imgGPU = worker.Malloc<byte>(new byte[pixelCount]);
        meanImg = worker.Malloc<float>(new float[pixelCount]);
        addReduce = DeviceSumModuleF32.Default.Create(numPixels);
        correctionFactor = worker.Malloc<float>(pixelCount);

        // Default correction factor is 1 for every pixel.
        float[] temp = new float[pixelCount];
        for (int i = 0; i < temp.Length; i++)
        {
            temp[i] = 1;
        }
        correctionFactor.Scatter(temp);
        scalarOutput = worker.Malloc<float>(1);

        if (File.Exists("correctionFactor.dat"))
        {
            byte[] buffer = new byte[pixelCount * 4];

            // `using` guarantees the file handle is released even if reading throws.
            using (FileStream stream = new FileStream("correctionFactor.dat", FileMode.Open))
            {
                int wanted = (int)Math.Min(buffer.Length, stream.Length);
                int filled = 0;

                // Stream.Read may return fewer bytes than requested, so keep reading
                // until the requested amount has arrived or the stream ends.
                while (filled < wanted)
                {
                    int read = stream.Read(buffer, filled, wanted - filled);
                    if (read == 0)
                    {
                        break;
                    }
                    filled += read;
                }
            }

            for (int i = 0; i < pixelCount; i++)
            {
                temp[i] = BitConverter.ToSingle(buffer, 4 * i);
            }
            correctionFactor.Scatter(temp);
        }

        // initialize CUDA parameters: 32x32 blocks, grid rounded up to cover 640x640
        var blockDims = new dim3(32, 32);
        var gridDims = new dim3(Common.divup(640, blockDims.x), Common.divup(640, blockDims.y));
        lp = new LaunchParam(gridDims, blockDims);

        // set up the camera parameters and events
        provider = new IduleProviderCsCam(0);
        provider.Initialize();
        if (provider.IsConnected)
        {
            provider.ImageTransaction += provider_ImageTransaction;
            provider.Interrupt += provider_Interrupt;
            provider.Exception += camera_Exception;
            provider.WriteRegister(new NanEyeGSRegisterPayload(false, 0x05, true, 0, prescaler));
            provider.WriteRegister(new NanEyeGSRegisterPayload(false, 0x06, true, 0, exposure));
            ProcessingWrapper.pr[0].ReduceProcessing = true;
        }
    }
    catch (Exception ex)
    {
        OnError(ex.Message);
    }
}
/// <summary>
/// Launches <c>col2im_gpu_kernel</c> on the default GPU with one thread per element of the
/// <c>channels * height * width</c> image, in blocks of <c>CudaUtils.BlockSize</c> threads.
/// </summary>
public static void col2im_ongpu(float[] dataCol, int channels, int height, int width, int ksize, int stride, int pad, float[] dataIm, int imStart = 0)
{
    // Column-buffer extents derived from the padded image size, kernel size and stride.
    var paddedHeight = height + 2 * pad;
    var paddedWidth = width + 2 * pad;
    int heightCol = (paddedHeight - ksize) / stride + 1;
    int widthCol = (paddedWidth - ksize) / stride + 1;

    // Thread count passed to the kernel; the grid is rounded up to cover it.
    int threadCount = channels * height * width;
    var blockCount = (threadCount + CudaUtils.BlockSize - 1) / CudaUtils.BlockSize;
    var launch = new LaunchParam(blockCount, CudaUtils.BlockSize);

    Gpu.Default.Launch(col2im_gpu_kernel, launch, threadCount, dataCol, height, width, ksize, pad, stride, heightCol, widthCol, dataIm, imStart);
}
//[/parallelSquareKernel]

//[parallelSquareLaunch]
/// <summary>
/// Uploads <paramref name="inputs"/>, launches <c>SquareKernel</c> with 256-thread blocks
/// (grid capped at 16 blocks per multiprocessor) and returns the gathered output.
/// </summary>
/// <param name="inputs">Host input values.</param>
/// <returns>A host array of the same length read back from the device.</returns>
static double[] SquareGPU(double[] inputs)
{
    const int threadsPerBlock = 256;
    var worker = Worker.Default;

    using (var devInputs = worker.Malloc(inputs))
    using (var devOutputs = worker.Malloc<double>(inputs.Length))
    {
        var smCount = worker.Device.Attributes.MULTIPROCESSOR_COUNT;
        var gridCap = 16 * smCount;
        var blocksNeeded = Common.divup(inputs.Length, threadsPerBlock);
        var launch = new LaunchParam(Math.Min(gridCap, blocksNeeded), threadsPerBlock);

        worker.Launch(SquareKernel, launch, devOutputs.Ptr, devInputs.Ptr, inputs.Length);

        return devOutputs.Gather();
    }
}
/// <summary>
/// Launches a three-argument kernel through the untyped <c>LaunchRaw</c> entry point. The
/// kernel is looked up in the module by the delegate's method name.
/// </summary>
/// <param name="module">Module that owns the kernel.</param>
/// <param name="kernelD">Delegate whose method name identifies the kernel.</param>
/// <param name="lp">Launch parameters.</param>
/// <param name="arg1">First kernel argument.</param>
/// <param name="arg2">Second kernel argument.</param>
/// <param name="arg3">Third kernel argument.</param>
public static void MyGPULaunch<T1, T2, T3>(
    this ILGPUModule module,
    Action<T1, T2, T3> kernelD,
    LaunchParam lp,
    T1 arg1,
    T2 arg2,
    T3 arg3)
{
    // get the kernel object by method name
    var kernelName = kernelD.Method.Name;
    var rawKernel = module.GPUEntities.GetKernel(kernelName).Kernel;

    // LaunchRaw takes its arguments as an FSharpList of boxed objects
    var boxedArgs = ListModule.OfArray(new object[] { arg1, arg2, arg3 });

    // use untyped LaunchRaw to launch the kernel
    rawKernel.LaunchRaw(lp, boxedArgs);
}
/// <summary>
/// Clears the result matrix, launches <c>MatrixMulCuda.MMultiplyKernel3</c> with a 2x2 grid
/// of 25x25-thread blocks, and shows <c>C[200][200]</c> together with the elapsed
/// milliseconds in a message box.
/// </summary>
private void mRunMatrixMultiplyGPU3( )
{
    var stopwatch = new Stopwatch( );

    this.mClearMatrix(this.vfC);

    stopwatch.Start( );
    var launch = new LaunchParam(new dim3(2, 2), new dim3(25, 25));
    this.voGpu.Launch(MatrixMulCuda.MMultiplyKernel3, launch, this.vfC, this.vfA, this.vfB);
    stopwatch.Stop( );

    var sampleText = "C[200][200] = " + this.vfC[200][200].ToString( );
    var timingText = "Time Elapsed = " + stopwatch.ElapsedMilliseconds.ToString( );
    MessageBox.Show(sampleText + Environment.NewLine + timingText);
}
/// <summary>
/// Launches <c>Kernel</c> with one block per system and <paramref name="n"/> threads per
/// block, requesting <c>9 * n * sizeof(double)</c> bytes of dynamic shared memory.
/// </summary>
/// <param name="numSystems">Number of blocks to launch.</param>
/// <param name="n">Threads per block; also forwarded to the kernel.</param>
/// <param name="dl">Device pointer forwarded to the kernel.</param>
/// <param name="dd">Device pointer forwarded to the kernel.</param>
/// <param name="du">Device pointer forwarded to the kernel.</param>
/// <param name="db">Device pointer forwarded to the kernel.</param>
/// <param name="dx">Device pointer forwarded to the kernel.</param>
public void Apply(int numSystems, int n, deviceptr<double> dl, deviceptr<double> dd, deviceptr<double> du, deviceptr<double> db, deviceptr<double> dx)
{
    var sharedBytes = 9 * n * sizeof(double);
    var launch = new LaunchParam(numSystems, n, sharedBytes);

    this.GPULaunch(this.Kernel, launch, n, dl, dd, du, db, dx);
}
//[/cuRANDComputeValue]

//[cuRANDPiEstimator]
/// <summary>
/// Generates <c>2 * numSims</c> quasi-random doubles with cuRAND, runs <c>ComputeValue</c>
/// over them, sums the per-block results on the host and returns
/// <c>(sum / numSims) * 4.0</c>.
/// </summary>
/// <param name="numSims">Number of simulated points.</param>
/// <param name="threadBlockSize">Threads per block.</param>
public double RunEstimation(int numSims, int threadBlockSize)
{
    // Aim to launch around ten or more times as many blocks as there
    // are multiprocessors on the target device.
    const int blocksPerSm = 10;
    var numSMs = GPUWorker.Device.Attributes.MULTIPROCESSOR_COUNT;

    // Determine how to divide the work between cores
    var block = new dim3(threadBlockSize);
    var grid = new dim3((numSims + threadBlockSize - 1) / threadBlockSize);

    // Halve the grid until it is at most 2 * blocksPerSm blocks per multiprocessor.
    // The bound previously used numSims, which can never be exceeded by grid.x, so the
    // loop was dead code and numSMs was unused.
    // NOTE(review): a smaller grid relies on ComputeValue striding over all numSims points
    // with gridDim.x * blockDim.x - confirm against the kernel.
    while (grid.x > 2 * blocksPerSm * numSMs)
    {
        grid.x >>= 1;
    }

    var n = 2 * numSims;

    using (var dPoints = GPUWorker.Malloc<double>(n))
    using (var dResults = GPUWorker.Malloc<double>(grid.x))
    {
        // Generate random points in unit square
        var curand = new CURAND(GPUWorker, CURANDInterop.curandRngType.CURAND_RNG_QUASI_SOBOL64);
        curand.SetQuasiRandomGeneratorDimensions(2);
        curand.SetGeneratorOrdering(CURANDInterop.curandOrdering.CURAND_ORDERING_QUASI_DEFAULT);
        curand.GenerateUniformDouble(dPoints.Ptr, new IntPtr(n));

        // One uint of dynamic shared memory per thread is requested for the kernel.
        var lp = new LaunchParam(grid, block, block.x * sizeof(uint));
        GPULaunch(ComputeValue, lp, dResults.Ptr, dPoints.Ptr, numSims);

        // One partial result per block; finish the sum on the host.
        var value = dResults.Gather().Sum();
        return (value / numSims) * 4.0;
    }
}