/// <summary>
/// Launches the layer-normalization backward kernel, producing gradients for the input,
/// alpha (gain) and beta (bias) tensors over row-major (rows, cols) slices of <paramref name="inGrad"/>.
/// </summary>
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; normalization runs over the last dimension.
    int ndim = inGrad.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    long cols = inGrad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    long rows = storageSize / cols;

    // threads.y defaults to 1 here, so the grid covers up to 1024 row-blocks.
    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr xPtr = CudaHelpers.GetBufferStart(x);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Kernel requests 4 floats of shared memory per thread.
    Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the fused add + layer-normalization kernel: result = LN(src1 + src2) with gain alpha and bias beta.
/// </summary>
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; normalization runs over the last dimension.
    int ndim = src1.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
    long cols = src1.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1);
    CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the row-wise softmax kernel over the last dimension of <paramref name="src"/>.
/// </summary>
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; softmax runs over the last dimension.
    int ndim = src.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    long cols = src.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

    Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
}
/// <summary>
/// Launches the Adam optimizer update kernel for <paramref name="weight"/> using gradient, second-moment (v)
/// and first-moment (m) buffers.
/// </summary>
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; the last dimension is the column count.
    int ndim = weight.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    long cols = weight.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr vPtr = CudaHelpers.GetBufferStart(v);
    CUdeviceptr mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
/// <summary>
/// Launches the softmax backward kernel over row-major (rows, cols) slices of <paramref name="grad"/>.
/// </summary>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
{
    var cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors; higher-rank tensors were partially processed.
    var ndim = grad.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= grad.Sizes[dim];
    }
    var cols = grad.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols);
}
/// <summary>
/// Launches the index-select backward kernel, scattering rows of <paramref name="adj"/> into
/// <paramref name="grad"/> at positions given by <paramref name="indice"/>.
/// </summary>
private void IndexSelectGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor indice)
{
    CudaContext cudaContext = context.CudaContextForTensor(adj);
    cudaContext.SetCurrent();

    // Fold every leading dimension of adj into the row count; the last dimension is the column count.
    int ndim = adj.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(adj.Sizes, adj.Strides);
    long cols = adj.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr indicePtr = CudaHelpers.GetBufferStart(indice);

    Invoke(context, cudaContext, "IndexSelectGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, indicePtr, rows, cols);
}
// Kernel signature: __global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
/// <summary>
/// Launches the SGD optimizer update kernel for <paramref name="weight"/> with gradient, cache and
/// per-weight learning-rate buffers.
/// </summary>
private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    var ndim = weight.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }
    var cols = weight.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var cachePtr = CudaHelpers.GetBufferStart(cache);
    var lrwPtr = CudaHelpers.GetBufferStart(lrw);

    Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// Launches the kernel that fills <paramref name="mask"/> with a padded self-attention triangular mask
/// derived from the original sequence lengths.
/// </summary>
private void BuildPadSelfTriMask(TSCudaContext context, Tensor mask, Tensor originalLengths, int batchSize)
{
    var cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    // Rows = product of all but the last dimension; cols = the last dimension.
    var dimCount = mask.DimensionCount;
    long rowCount = 1;
    for (var i = 0; i < dimCount - 1; i++)
    {
        rowCount *= mask.Sizes[i];
    }
    var colCount = mask.Sizes[dimCount - 1];

    var threads = new dim3((uint)Math.Min(512, rowCount));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rowCount, threads.y)));

    var maskPtr = CudaHelpers.GetBufferStart(mask);
    var lengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    Invoke(context, cudaContext, "BuildPadSelfTriMask", grid, threads, 0, CUstream.NullStream, maskPtr, lengthsPtr, batchSize, rowCount, colCount);
}
/// <summary>
/// Launches the Adam optimizer update kernel for <paramref name="weight"/> using gradient, second-moment (v)
/// and first-moment (m) buffers.
/// </summary>
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    var ndim = weight.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }
    var cols = weight.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var vPtr = CudaHelpers.GetBufferStart(v);
    var mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
/// <summary>
/// Launches the kernel that writes a self-attention triangular mask into <paramref name="result"/>,
/// using <paramref name="value"/> for visible positions and <paramref name="maskedValue"/> for masked ones.
/// </summary>
private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
{
    CudaContext cudaContext = context.CudaContextForTensor(originalLengths);
    cudaContext.SetCurrent();

    // Fold every leading dimension of result into the row count; the last dimension is the column count.
    int ndim = result.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
    long cols = result.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    Invoke(context, cudaContext, "BuildSelfTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue);
}
// Kernel signature: BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)
/// <summary>
/// Launches the kernel that fills <paramref name="mask"/> with a source/target attention mask
/// derived from the original source and target sequence lengths.
/// </summary>
private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
{
    CudaContext cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    // Rows = product of all but the last dimension; cols = the last dimension.
    int dimCount = mask.DimensionCount;
    long rowCount = 1;
    for (int i = 0; i < dimCount - 1; i++)
    {
        rowCount *= mask.Sizes[i];
    }
    long colCount = mask.Sizes[dimCount - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rowCount));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rowCount, threads.y)));

    CUdeviceptr maskPtr = CudaHelpers.GetBufferStart(mask);
    CUdeviceptr srcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
    CUdeviceptr tgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);

    Invoke(context, cudaContext, "BuildSrcTgtMask", grid, threads, 0, CUstream.NullStream, maskPtr, srcLengthsPtr, tgtLengthsPtr, batchSize, rowCount, colCount);
}
/// <summary>
/// Launches the softmax backward kernel; <paramref name="addGrad"/> selects whether the kernel
/// accumulates into or overwrites <paramref name="grad"/> (passed as an int flag).
/// </summary>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    CudaContext cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    int ndim = grad.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= grad.Sizes[dim];
    }
    long cols = grad.Sizes[ndim - 1];
    int iAddGrad = addGrad ? 1 : 0;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
/// <summary>
/// Launches the row-wise softmax kernel over the last dimension of <paramref name="src"/>.
/// </summary>
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    int ndim = src.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= src.Sizes[dim];
    }
    long cols = src.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

    Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
}
/// <summary>
/// Launches the fused add + layer-normalization kernel: result = LN(src1 + src2) with gain alpha and bias beta.
/// </summary>
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    int ndim = src1.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= src1.Sizes[dim];
    }
    long cols = src1.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1);
    CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the RMSProp optimizer update kernel for <paramref name="weight"/> using gradient and cache buffers.
/// </summary>
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; the last dimension is the column count.
    var ndim = weight.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    var cols = weight.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var cachePtr = CudaHelpers.GetBufferStart(cache);

    this.Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// Launches the RMSProp optimizer update kernel for <paramref name="weight"/> using gradient and cache buffers.
/// </summary>
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    int ndim = weight.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }
    long cols = weight.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache);

    Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// Picks a 1-D block size for a contiguous reduction: fewer slices get larger blocks so the
/// reduction dimension still provides enough parallelism.
/// </summary>
private static dim3 GetContigReduceBlock(CudaContext cudaContext, long numSlices, long reductionSize)
{
    // Aim for at least 32 warps per SM (assume 15 SMs; don't bother inquiring the real number for now).
    const int smCount = 15;

    int maxWarps;
    if (numSlices < smCount * 2)
    {
        maxWarps = 32;
    }
    else if (numSlices < smCount * 4)
    {
        maxWarps = 16;
    }
    else if (numSlices < smCount * 8)
    {
        maxWarps = 8;
    }
    else
    {
        // For numSlices > smCount * 8, there are > 32 warps active per SM;
        // better occupancy if many blocks are around.
        maxWarps = 4;
    }

    // Never use more warps than the reduction dimension can actually occupy.
    var warpsInReductionSize = ApplyUtils.CeilDiv(reductionSize, 32);
    var numWarps = warpsInReductionSize > maxWarps ? maxWarps : (int)warpsInReductionSize;

    // Clamp to the device's maximum block dimension.
    var targetSize = Math.Min(numWarps * 32, (int)cudaContext.GetDeviceInfo().MaxBlockDim.x);
    return new dim3(targetSize);
}
/// <summary>
/// Maps a flat tile count onto a 3-D grid, spilling excess tiles into y and then z,
/// each axis clamped to MaxGridSize.
/// </summary>
private static dim3 GridFromTiles(long gridTiles)
{
    if (gridTiles > MaxGridSize * MaxGridSize * MaxGridSize)
    {
        throw new ArgumentException("gridTiles exceeds the maximum allowed tile count", nameof(gridTiles));
    }

    long x = Math.Min(gridTiles, MaxGridSize);
    long y = 1;
    long z = 1;

    if (gridTiles > MaxGridSize)
    {
        // Remaining tiles after filling x go into y.
        gridTiles = ApplyUtils.CeilDiv(gridTiles, MaxGridSize);
        y = Math.Min(gridTiles, MaxGridSize);

        if (gridTiles > MaxGridSize)
        {
            // And whatever still remains goes into z.
            gridTiles = ApplyUtils.CeilDiv(gridTiles, MaxGridSize);
            z = Math.Min(gridTiles, MaxGridSize);
        }
    }

    return new dim3((uint)x, (uint)y, (uint)z);
}
/// <summary>
/// Launches the fused add + layer-normalization backward kernel, producing gradients for both
/// inputs, alpha (gain) and beta (bias).
/// </summary>
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; normalization runs over the last dimension.
    var ndim = inGrad.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    var cols = inGrad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    var out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var x1Ptr = CudaHelpers.GetBufferStart(x1);
    var x2Ptr = CudaHelpers.GetBufferStart(x2);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Kernel requests 4 floats of shared memory per thread.
    this.Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the layer-normalization forward kernel: result = LN(src) with gain alpha and bias beta.
/// </summary>
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; normalization runs over the last dimension.
    var ndim = src.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    var cols = src.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    this.Invoke(context, cudaContext, "gLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the UpdateCost kernel over <paramref name="weight"/> using the given id and cost buffers.
/// </summary>
private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    // Rows = product of all but the last dimension; cols = the last dimension.
    int dimCount = weight.DimensionCount;
    long rowCount = 1;
    for (int i = 0; i < dimCount - 1; i++)
    {
        rowCount *= weight.Sizes[i];
    }
    long colCount = weight.Sizes[dimCount - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rowCount));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rowCount, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr idsPtr = CudaHelpers.GetBufferStart(ids);
    CUdeviceptr costsPtr = CudaHelpers.GetBufferStart(costs);

    Invoke(context, cudaContext, "UpdateCost", grid, threads, 0, CUstream.NullStream, weightPtr, idsPtr, costsPtr, rowCount, colCount);
}
/// <summary>
/// Launches the layer-normalization forward kernel: result = LN(src) with gain alpha and bias beta.
/// </summary>
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    var ndim = src.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= src.Sizes[dim];
    }
    var cols = src.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the softmax backward kernel; <paramref name="addGrad"/> selects whether the kernel
/// accumulates into or overwrites <paramref name="grad"/> (passed as an int flag).
/// </summary>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    var cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count; the last dimension is the column count.
    var ndim = grad.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
    var cols = grad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception type instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    var rows = storageSize / cols;
    var iAddGrad = addGrad ? 1 : 0;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var valPtr = CudaHelpers.GetBufferStart(val);

    this.Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
/// <summary>
/// Launches the fused add + layer-normalization backward kernel, producing gradients for both
/// inputs, alpha (gain) and beta (bias).
/// </summary>
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    int ndim = inGrad.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= inGrad.Sizes[dim];
    }
    long cols = inGrad.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1);
    CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Kernel requests 4 floats of shared memory per thread.
    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the layer-normalization backward kernel, producing gradients for the input,
/// alpha (gain) and beta (bias) tensors.
/// </summary>
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    // Fold every leading dimension into the row count and use the last dimension as columns,
    // so the values passed to the kernel agree with the launch geometry (which was already sized
    // from the folded row count). Previously rows/cols came from Sizes[0]/Sizes[1], which only
    // matched for 2-D tensors.
    var ndim = inGrad.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= inGrad.Sizes[dim];
    }
    var cols = inGrad.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var xPtr = CudaHelpers.GetBufferStart(x);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Kernel requests 4 floats of shared memory per thread.
    Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the masked softmax kernel: softmax over the last dimension of <paramref name="src"/>,
/// with <paramref name="mask"/> rows broadcast over src rows (rows must be a multiple of mask rows,
/// and column counts must match).
/// </summary>
private void SoftmaxMask(TSCudaContext context, Tensor result, Tensor src, Tensor mask)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    // Fold every leading dimension of src into the row count; the last dimension is the column count.
    var ndim = src.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    var cols = src.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        // Specific exception types instead of bare Exception; existing catch (Exception) callers still match.
        throw new InvalidOperationException($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }
    var rows = storageSize / cols;

    // Same folding for the mask tensor.
    var maskNdim = mask.DimensionCount;
    var maskStorageSize = TensorDimensionHelpers.GetStorageSize(mask.Sizes, mask.Strides);
    var maskCols = mask.Sizes[maskNdim - 1];
    if (maskStorageSize % maskCols != 0)
    {
        throw new InvalidOperationException($"Invalid mask tensor storage size = '{maskStorageSize}', and cols = '{maskCols}'");
    }
    var maskRows = maskStorageSize / maskCols;

    // The mask is broadcast row-wise, so src rows must be an exact multiple of mask rows.
    if (rows % maskRows != 0)
    {
        throw new ArgumentException($"Invalid tensor rows = '{rows}' and mask tensor rows = '{maskRows}'");
    }
    if (cols != maskCols)
    {
        throw new ArgumentException($"Tensor cols = '{cols}', mask tensor cols = '{maskCols}'. They should be equal.");
    }

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);
    var maskPtr = CudaHelpers.GetBufferStart(mask);

    this.Invoke(context, cudaContext, "gSoftmaxMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, maskPtr, rows, cols, maskRows);
}
/// <summary>
/// Computes the block count for a two-pass reduce-all, capped by the device scratch space
/// (which must hold one float per block for the intermediate results).
/// </summary>
private static long getTwoPassBlocks(TSCudaContext context, int deviceId, long elements)
{
    var numBlocks = ApplyUtils.CeilDiv(elements, ReduceAllBlockSize);

    // We can only have as many blocks as there is scratch space.
    var scratchFloats = context.ScratchSpaceForDevice(deviceId).size / sizeof(float);
    if (scratchFloats <= 0)
    {
        throw new ApplicationException("Device id " + deviceId + " has no scratch space");
    }

    return Math.Min(numBlocks, scratchFloats);
}
/// <summary>
/// Launches the variance kernel over the innermost dimension of <paramref name="src"/>.
/// </summary>
/// <param name="context">The CUDA context provider.</param>
/// <param name="result">The output array.</param>
/// <param name="src">The input array.</param>
/// <param name="normByN">If <c>true</c>, normalize by N (population) rather than N-1.</param>
/// <param name="applySqrt">If <c>true</c>, take the square root (standard deviation).</param>
private void VarInnermostDim(TSCudaContext context, NDArray result, NDArray src, bool normByN, bool applySqrt)
{
    var cudaContext = context.CudaContextForTensor(src);

    // Rows = product of all but the last dimension; each row is reduced independently.
    var dimCount = src.DimensionCount;
    long rowCount = 1;
    for (var i = 0; i < dimCount - 1; i++)
    {
        rowCount *= src.Shape[i];
    }
    var rowSize = src.Shape[dimCount - 1];

    // (Comment from cuTorch source): From limited testing, 16x32 seemed a good compromise
    // for handling both long and short dimensions.
    var block = new dim3(16, 32);
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rowCount, block.y)));

    var dstPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    // Kernel name encodes the normByN/applySqrt variant.
    var kernelName = "kernel_varInnermostDim" + GetMangledNameSuffix(normByN, applySqrt);
    Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, dstPtr, srcPtr, rowCount, rowSize);
}
/// <summary>
/// Launches an index-producing reduction (e.g. argmax-style) over the innermost dimension of
/// <paramref name="src"/>, writing reduced values and their indices.
/// </summary>
private void ReduceIndexInnermostDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, Tuple<float, float> init, string baseKernelName)
{
    var cudaContext = context.CudaContextForTensor(src);

    // Rows = product of all but the last dimension; each row is reduced independently.
    var dimCount = src.DimensionCount;
    long rowCount = 1;
    for (var i = 0; i < dimCount - 1; i++)
    {
        rowCount *= src.Sizes[i];
    }
    var rowSize = src.Sizes[dimCount - 1];

    var block = new dim3(16, 32);
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rowCount, block.y)));

    var valuesPtr = CudaHelpers.GetBufferStart(resultValues);
    var indicesPtr = CudaHelpers.GetBufferStart(resultIndices);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    // Kernel name is the inner-dimension variant of the requested base reduction.
    var kernelName = "inner_index_" + baseKernelName;
    Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, valuesPtr, indicesPtr, srcPtr, rowCount, rowSize, init.Item1, init.Item2);
}
/// <summary>
/// Launches the variance kernel over an outer (non-innermost) dimension of <paramref name="src"/>.
/// </summary>
/// <param name="context">The CUDA context provider.</param>
/// <param name="result">The output array.</param>
/// <param name="src">The input array.</param>
/// <param name="dimension">The dimension to reduce over.</param>
/// <param name="normByN">If <c>true</c>, normalize by N (population) rather than N-1.</param>
/// <param name="applySqrt">If <c>true</c>, take the square root (standard deviation).</param>
private void VarOuterDim(TSCudaContext context, NDArray result, NDArray src, int dimension, bool normByN, bool applySqrt)
{
    var cudaContext = context.CudaContextForTensor(src);
    var dimCount = src.DimensionCount;

    // Outer rows: product of all dimensions before the reduced one.
    long outerRows = 1;
    for (var i = 0; i < dimension; i++)
    {
        outerRows *= src.Shape[i];
    }

    var rowSize = src.Shape[dimension];

    // Inner rows: treat all dimensions after the reduced one as a single flat dimension.
    long innerRows = 1;
    for (var i = dimension + 1; i < dimCount; i++)
    {
        innerRows *= src.Shape[i];
    }

    var threads = new dim3((uint)Math.Min(512, innerRows));
    const int maxGridDim = 1024;
    var grid = new dim3((uint)Math.Min(maxGridDim, outerRows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(innerRows, threads.x)));

    var dstPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    // Kernel name encodes the normByN/applySqrt variant.
    var kernelName = "kernel_varOuterDim" + GetMangledNameSuffix(normByN, applySqrt);
    Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, dstPtr, srcPtr, outerRows, innerRows, rowSize);
}