private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    var cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    var ndim = grad.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
    var cols = grad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;
    var iAddGrad = addGrad ? 1 : 0;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var valPtr = CudaHelpers.GetBufferStart(val);

    this.Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
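// All of the row-wise wrappers in this section reduce a tensor to a (rows, cols) view:
// the last dimension is the row length and every leading dimension is folded into the
// row count, either via GetStorageSize(...) / cols (as above) or by multiplying the
// leading sizes. A minimal sketch of that convention as a stand-alone helper
// (hypothetical, not part of this class; assumes a contiguous row-major tensor):
private static (long rows, long cols) ToRowsCols(Tensor t)
{
    int ndim = t.DimensionCount;
    long cols = t.Sizes[ndim - 1];
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= t.Sizes[dim];   // fold all leading dimensions into the row count
    }

    return (rows, cols);
}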
//BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)
private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
{
    var cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    var ndim = mask.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= mask.Sizes[dim];
    }

    var cols = mask.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var maskPtr = CudaHelpers.GetBufferStart(mask);
    var originalSrcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
    var originalTgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);

    this.Invoke(context, cudaContext, "BuildSrcTgtMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalSrcLengthsPtr, originalTgtLengthsPtr, batchSize, rows, cols);
}
private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
{
    CudaContext cudaContext = context.CudaContextForTensor(originalLengths);
    cudaContext.SetCurrent();

    int ndim = result.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
    long cols = result.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    Invoke(context, cudaContext, "BuildSelfTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue);
}
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var ndim = weight.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    var cols = weight.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var cachePtr = CudaHelpers.GetBufferStart(cache);

    this.Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    int ndim = inGrad.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    long cols = inGrad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr xPtr = CudaHelpers.GetBufferStart(x);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    int ndim = src1.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
    long cols = src1.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1);
    CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    int ndim = src.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    long cols = src.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

    Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
}
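// The "gSoftmax" kernel above is launched with threads.x lanes per row slab and
// threads.x * sizeof(float) bytes of shared memory, which is consistent with a
// row-wise softmax using a shared-memory max/sum reduction. A CPU sketch of the
// per-row math it is expected to compute (an assumption based on the kernel name
// and launch shape; illustrative only, not the device source):
private static void SoftmaxRowReference(float[] src, float[] result, int rows, int cols)
{
    for (int r = 0; r < rows; r++)
    {
        int offset = r * cols;

        float max = float.MinValue;
        for (int c = 0; c < cols; c++)
        {
            max = Math.Max(max, src[offset + c]);   // subtract the row max for numerical stability
        }

        float sum = 0f;
        for (int c = 0; c < cols; c++)
        {
            float e = (float)Math.Exp(src[offset + c] - max);
            result[offset + c] = e;
            sum += e;
        }

        for (int c = 0; c < cols; c++)
        {
            result[offset + c] /= sum;              // normalize each row to probabilities
        }
    }
}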
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    var ndim = src.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    var cols = src.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    this.Invoke(context, cudaContext, "gLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
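// "gLNormalization" receives per-column scale (alpha) and bias (beta) vectors plus an
// epsilon, which matches standard layer normalization over the last dimension. A CPU
// sketch of the assumed per-row computation (illustrative only, not the kernel source):
private static void LayerNormRowReference(float[] src, float[] result, float[] alpha, float[] beta, int rows, int cols, float eps)
{
    for (int r = 0; r < rows; r++)
    {
        int offset = r * cols;

        float mean = 0f;
        for (int c = 0; c < cols; c++)
        {
            mean += src[offset + c];
        }
        mean /= cols;

        float variance = 0f;
        for (int c = 0; c < cols; c++)
        {
            float d = src[offset + c] - mean;
            variance += d * d;
        }
        variance /= cols;

        float invStd = 1f / (float)Math.Sqrt(variance + eps);
        for (int c = 0; c < cols; c++)
        {
            // y = alpha * (x - mean) / sqrt(var + eps) + beta
            result[offset + c] = alpha[c] * (src[offset + c] - mean) * invStd + beta[c];
        }
    }
}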
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
{
    var cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    var rows = grad.Sizes[0];
    var cols = grad.Sizes[1];

    var ndim = grad.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= grad.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols);
}
private void IndexSelectGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor indice)
{
    CudaContext cudaContext = context.CudaContextForTensor(adj);
    cudaContext.SetCurrent();

    int ndim = adj.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(adj.Sizes, adj.Strides);
    long cols = adj.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr indicePtr = CudaHelpers.GetBufferStart(indice);

    Invoke(context, cudaContext, "IndexSelectGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, indicePtr, rows, cols);
}
//__global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var rows = weight.Sizes[0];
    var cols = weight.Sizes[1];

    var ndim = weight.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= weight.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var cachePtr = CudaHelpers.GetBufferStart(cache);
    var lrwPtr = CudaHelpers.GetBufferStart(lrw);

    Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
private void BuildPadSelfTriMask(TSCudaContext context, Tensor mask, Tensor originalLengths, int batchSize)
{
    CudaContext cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    int ndim = mask.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= mask.Sizes[dim];
    }

    long cols = mask.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr maskPtr = CudaHelpers.GetBufferStart(mask);
    CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    Invoke(context, cudaContext, "BuildPadSelfTriMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalLengthsPtr, batchSize, rows, cols);
}
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var rows = weight.Sizes[0];
    var cols = weight.Sizes[1];

    var ndim = weight.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= weight.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var vPtr = CudaHelpers.GetBufferStart(v);
    var mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
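// The "Adam" kernel receives first/second-moment buffers (m, v), decay rates, an
// iteration counter and eps, i.e. the inputs of the textbook Adam step. For reference,
// the standard per-element update is sketched below; how the device kernel folds in
// batchSize, clipval and regc is not shown here and is a detail of the kernel source:
private static void AdamReferenceStep(ref float w, ref float m, ref float v, float g,
    float step_size, float decay_rate_m, float decay_rate_v, int iter, float eps)
{
    m = decay_rate_m * m + (1f - decay_rate_m) * g;                 // first moment estimate
    v = decay_rate_v * v + (1f - decay_rate_v) * g * g;             // second moment estimate

    float mHat = m / (1f - (float)Math.Pow(decay_rate_m, iter));    // bias correction
    float vHat = v / (1f - (float)Math.Pow(decay_rate_v, iter));

    w -= step_size * mHat / ((float)Math.Sqrt(vHat) + eps);         // parameter update
}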
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    var rows = src.Sizes[0];
    var cols = src.Sizes[1];

    var ndim = src.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= src.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int ndim = weight.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    long cols = weight.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr vPtr = CudaHelpers.GetBufferStart(v);
    CUdeviceptr mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    var ndim = inGrad.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    var cols = inGrad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    var out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var x1Ptr = CudaHelpers.GetBufferStart(x1);
    var x2Ptr = CudaHelpers.GetBufferStart(x2);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    this.Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var ndim = weight.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }

    var cols = weight.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var idsPtr = CudaHelpers.GetBufferStart(ids);
    var costsPtr = CudaHelpers.GetBufferStart(costs);

    this.Invoke(context, cudaContext, "UpdateCost", grid, threads, 0, CUstream.NullStream, weightPtr, idsPtr, costsPtr, rows, cols);
}
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    CudaContext cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    long rows = grad.Sizes[0];
    long cols = grad.Sizes[1];
    int iAddGrad = addGrad ? 1 : 0;

    int ndim = grad.DimensionCount;
    long num_rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= grad.Sizes[dim];
    }

    dim3 threads = new dim3((uint)Math.Min(512, num_rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    var rows = inGrad.Sizes[0];
    var cols = inGrad.Sizes[1];

    var ndim = inGrad.DimensionCount;
    long num_rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= inGrad.Sizes[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    var outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var xPtr = CudaHelpers.GetBufferStart(x);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    long rows = src.Sizes[0];
    long cols = src.Sizes[1];

    int ndim = src.DimensionCount;
    long num_rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= src.Sizes[dim];
    }

    dim3 threads = new dim3((uint)Math.Min(512, num_rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

    Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
}
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    long rows = weight.Sizes[0];
    long cols = weight.Sizes[1];

    int ndim = weight.DimensionCount;
    long num_rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= weight.Sizes[dim];
    }

    dim3 threads = new dim3((uint)Math.Min(512, num_rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache);

    Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    long rows = inGrad.Sizes[0];
    long cols = inGrad.Sizes[1];

    int ndim = inGrad.DimensionCount;
    long num_rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= inGrad.Sizes[dim];
    }

    dim3 threads = new dim3((uint)Math.Min(512, num_rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1);
    CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    long rows = src1.Sizes[0];
    long cols = src1.Sizes[1];

    int ndim = src1.DimensionCount;
    long num_rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        num_rows *= src1.Sizes[dim];
    }

    dim3 threads = new dim3((uint)Math.Min(512, num_rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1);
    CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
private static void Run_M_V_float(TSCudaContext context, Tensor result, Tensor mat, Tensor vec)
{
    // Require lhs to be row-major. This means we must tell BLAS to transpose it (BLAS expects column-major matrices)
    if (mat.Strides[1] != 1)
    {
        throw new ArgumentException("lhs must be contiguous in the last dimension");
    }

    using (var blas = context.BlasForTensor(mat))
    {
        var yPtr = CudaHelpers.GetBufferStart(result);
        var aPtr = CudaHelpers.GetBufferStart(mat);
        var xPtr = CudaHelpers.GetBufferStart(vec);

        Operation trans = Operation.Transpose;
        int m = (int)mat.Sizes[1];
        int n = (int)mat.Sizes[0];
        int incx = (int)vec.Strides[0];
        int lda = (int)mat.Strides[0];
        int incy = (int)result.Strides[0];
        float alpha = 1;
        float beta = 0;

        CudaBlasNativeMethods.cublasSgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
    }
}
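// Note on the cublasSgemv_v2 call above: cuBLAS expects column-major storage, and a
// row-major (rows x cols) matrix with leading dimension lda is bit-identical to a
// column-major (cols x rows) matrix. Passing Operation.Transpose with m = cols,
// n = rows and lda = mat.Strides[0] therefore makes cuBLAS compute y = mat * vec
// directly on the row-major data, without any explicit transposition or copy.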
/// <summary>
/// Computes the variance (optionally the standard deviation) of <paramref name="src"/> along <paramref name="dimension"/>.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="result">The result.</param>
/// <param name="src">The source.</param>
/// <param name="dimension">The dimension to reduce over.</param>
/// <param name="normByN">if set to <c>true</c>, normalize by N instead of N - 1.</param>
/// <param name="applySqrt">if set to <c>true</c>, apply the square root, i.e. return the standard deviation.</param>
private void VarOuterDim(TSCudaContext context, NDArray result, NDArray src, int dimension, bool normByN, bool applySqrt)
{
    var cudaContext = context.CudaContextForTensor(src);

    int ndim = src.DimensionCount;
    long num_orows = 1;
    for (int dim = 0; dim < dimension; dim++)
    {
        num_orows *= src.Shape[dim];
    }

    long row_size = src.Shape[dimension];

    // Treat all inner dimensions (i.e. dim > dimension) as one.
    long num_irows = 1;
    for (int dim = dimension + 1; dim < ndim; dim++)
    {
        num_irows *= src.Shape[dim];
    }

    var threads = new dim3((uint)Math.Min(512, num_irows));
    var maxGridDim = 1024;
    var grid = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    var kernelName = "kernel_varOuterDim" + GetMangledNameSuffix(normByN, applySqrt);

    Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultPtr, srcPtr, num_orows, num_irows, row_size);
}
private void ReduceIndexOuterDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, int dimension, Tuple<float, float> init, string baseKernelName)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);

    int ndim = src.DimensionCount;
    long num_orows = 1;
    for (int dim = 0; dim < dimension; dim++)
    {
        num_orows *= src.Sizes[dim];
    }

    long row_size = src.Sizes[dimension];

    long num_irows = 1;
    for (int dim = dimension + 1; dim < ndim; dim++)
    {
        num_irows *= src.Sizes[dim];
    }

    dim3 threads = new dim3((uint)Math.Min(512, num_irows));
    int maxGridDim = 1024;
    dim3 grid = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

    CUdeviceptr resultValPtr = CudaHelpers.GetBufferStart(resultValues);
    CUdeviceptr resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

    string kernelName = "outer_index_" + baseKernelName;

    Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultValPtr, resultIdxPtr, srcPtr, num_orows, num_irows, row_size, init.Item1, init.Item2);
}
/// <summary>
/// Converts a column buffer back into an image tensor (col2im).
/// </summary>
/// <param name="col">The column buffer.</param>
/// <param name="im">The image tensor.</param>
/// <param name="channels">The number of channels.</param>
/// <param name="height">The image height.</param>
/// <param name="width">The image width.</param>
/// <param name="patch_h">The patch height.</param>
/// <param name="patch_w">The patch width.</param>
/// <param name="pad_h">The padding height.</param>
/// <param name="pad_w">The padding width.</param>
/// <param name="stride_h">The stride height.</param>
/// <param name="stride_w">The stride width.</param>
/// <param name="dilation_h">The dilation height.</param>
/// <param name="dilation_w">The dilation width.</param>
public void Col2Im(Tensor col, Tensor im, int channels, int height, int width, int patch_h, int patch_w, int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w)
{
    var context = CudaHelpers.TSContextForTensor(im);
    var cudaContext = context.CudaContextForTensor(im);

    int height_col = (height + 2 * pad_h - (dilation_h * (patch_h - 1) + 1)) / stride_h + 1;
    int width_col = (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1)) / stride_w + 1;
    int num_kernels = channels * height * width;

    var data_im = CudaHelpers.GetBufferStart(im);
    var data_col = CudaHelpers.GetBufferStart(col);

    // From Torch source:
    // To avoid involving atomic operations, we will launch one kernel per
    // bottom dimension, and then in the kernel add up the top dimensions.
    Invoke(context, cudaContext, "col2im_kernel", new dim3(NNThreads.NumBlocks(num_kernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream, num_kernels, data_col, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_im);
}
/// <summary>
/// Converts an image tensor into a column buffer (im2col).
/// </summary>
/// <param name="im">The image tensor.</param>
/// <param name="col">The column buffer.</param>
/// <param name="channels">The number of channels.</param>
/// <param name="height">The image height.</param>
/// <param name="width">The image width.</param>
/// <param name="ksize_h">The kernel height.</param>
/// <param name="ksize_w">The kernel width.</param>
/// <param name="pad_h">The padding height.</param>
/// <param name="pad_w">The padding width.</param>
/// <param name="stride_h">The stride height.</param>
/// <param name="stride_w">The stride width.</param>
/// <param name="dilation_h">The dilation height.</param>
/// <param name="dilation_w">The dilation width.</param>
public void Im2Col(Tensor im, Tensor col, int channels, int height, int width, int ksize_h, int ksize_w, int pad_h, int pad_w, int stride_h, int stride_w, int dilation_h, int dilation_w)
{
    var context = CudaHelpers.TSContextForTensor(im);
    var cudaContext = context.CudaContextForTensor(im);

    // From Torch source:
    // We are going to launch channels * height_col * width_col kernels, each
    // kernel responsible for copying a single-channel grid.
    int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
    int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1)) / stride_w + 1;
    int num_kernels = channels * height_col * width_col;

    var data_im = CudaHelpers.GetBufferStart(im);
    var data_col = CudaHelpers.GetBufferStart(col);

    Invoke(context, cudaContext, "im2col_kernel", new dim3(NNThreads.NumBlocks(num_kernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream, num_kernels, data_im, height, width, ksize_h, ksize_w, pad_h, pad_w, stride_h, stride_w, dilation_h, dilation_w, height_col, width_col, data_col);
}
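// Col2Im and Im2Col above both size their column buffer with the standard convolution
// output-size formula (integer division):
//   out = (in + 2 * pad - (dilation * (kernel - 1) + 1)) / stride + 1
// For example, height = 224, pad_h = 1, patch_h = 3, dilation_h = 1, stride_h = 1 gives
// height_col = (224 + 2 - 3) / 1 + 1 = 224, i.e. a "same"-padded 3x3 window.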
public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    var context = CudaHelpers.TSContextForTensor(gradOutput);
    var cudaContext = context.CudaContextForTensor(gradOutput);

    var dimw = 3;
    var dimh = 2;
    var dimc = 1;

    var nbatch = input.Sizes[0];
    var nslices = input.Sizes[dimc];
    var iheight = input.Sizes[dimh];
    var iwidth = input.Sizes[dimw];
    var owidth = gradOutput.Sizes[dimw];
    var oheight = gradOutput.Sizes[dimh];

    using var gradOutputContig = Ops.AsContiguous(gradOutput);
    var gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
    var indicesPtr = CudaHelpers.GetBufferStart(indices);
    var gradInputPtr = CudaHelpers.GetBufferStart(gradInput);

    var count = (int)input.ElementCount();

    this.Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream, count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth, cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
}
public void SpatialMaxPoolingForward(Tensor input, Tensor output, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    var context = CudaHelpers.TSContextForTensor(input);
    var cudaContext = context.CudaContextForTensor(input);

    var iwidth = input.Sizes[3];
    var iheight = input.Sizes[2];
    var nInputPlane = input.Sizes[1];
    var batchSize = input.Sizes[0];

    long owidth;
    long oheight;

    if (ceilMode)
    {
        // ReSharper disable once ArrangeRedundantParentheses
        oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        // ReSharper disable once ArrangeRedundantParentheses
        owidth = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }
    else
    {
        // ReSharper disable once ArrangeRedundantParentheses
        oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
        // ReSharper disable once ArrangeRedundantParentheses
        owidth = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
    }

    if (cd.padW != 0 || cd.padH != 0)
    {
        // ensure that the last pooling starts inside the image
        if ((oheight - 1) * cd.dH >= iheight + cd.padH)
        {
            --oheight;
        }

        if ((owidth - 1) * cd.dW >= iwidth + cd.padW)
        {
            --owidth;
        }
    }

    using var inputContig = Ops.AsContiguous(input);
    var inputPtr = CudaHelpers.GetBufferStart(inputContig);
    var outputPtr = CudaHelpers.GetBufferStart(output);
    var indicesPtr = CudaHelpers.GetBufferStart(indices);

    var count = (int)output.ElementCount();

    this.Invoke(context, cudaContext, "MaxPoolForward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream, count, inputPtr, batchSize, nInputPlane, iheight, iwidth, oheight, owidth, cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, outputPtr, indicesPtr);
}
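// Worked example of the ceil/floor output-size choice in SpatialMaxPoolingForward:
// with iheight = 5, kH = 2, padH = 1, dH = 2, the quotient is (5 - 2 + 2) / 2 = 2.5,
// so floor mode yields 2 + 1 = 3 output rows and ceil mode yields 3 + 1 = 4.
// The follow-up check (4 - 1) * 2 = 6 >= 5 + 1 then trims the ceil-mode result back
// to 3, so the last pooling window still starts inside the padded input.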