/// <summary>
/// Launches the fused add + layer-normalization kernel ("gAddLNormalization") over the element-wise
/// sum of <paramref name="src1"/> and <paramref name="src2"/>, writing into <paramref name="result"/>.
/// The tensor is viewed as (rows x cols) with the innermost dimension as a row.
/// </summary>
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    // Fold all leading dimensions into a row count; the last dimension is the normalized axis.
    var lastDim = src1.DimensionCount - 1;
    var cols = src1.Sizes[lastDim];
    var storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // Up to 512 threads per block; threads.y defaults to 1, so the grid holds min(1024, rows) blocks.
    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var src1Ptr = CudaHelpers.GetBufferStart(src1);
    var src2Ptr = CudaHelpers.GetBufferStart(src2);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: one float per thread (presumably reduction scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Fills <paramref name="result"/> via the "BuildSelfTriMask" kernel, producing a self-attention
/// triangular mask: positions receive <paramref name="value"/> or <paramref name="maskedValue"/>
/// driven by the per-sequence lengths in <paramref name="originalLengths"/> (exact masking rule
/// lives in the kernel).
/// </summary>
private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
{
    var cudaContext = context.CudaContextForTensor(originalLengths);
    cudaContext.SetCurrent();

    // View the mask tensor as (rows x cols) with the innermost dimension as a row.
    var lastDim = result.DimensionCount - 1;
    var cols = result.Sizes[lastDim];
    var storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    Invoke(context, cudaContext, "BuildSelfTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue);
}
/// <summary>
/// Launches the "Adam" optimizer kernel, updating <paramref name="weight"/> from
/// <paramref name="gradient"/> together with the accumulator tensors <paramref name="v"/> and
/// <paramref name="m"/>. The weight tensor is viewed as (rows x cols) with the innermost
/// dimension as a row.
/// </summary>
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var lastDim = weight.DimensionCount - 1;
    var cols = weight.Sizes[lastDim];
    var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var vPtr = CudaHelpers.GetBufferStart(v);
    var mPtr = CudaHelpers.GetBufferStart(m);

    // No dynamic shared memory is requested for this kernel.
    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream,
           weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
/// <summary>
/// Launches the layer-normalization backward kernel ("gLayerNormalizationGrad"), producing the
/// input gradient plus gradients for the scale (<paramref name="alphaGrad"/>) and bias
/// (<paramref name="betaGrad"/>) parameters. Tensors are viewed as (rows x cols) with the
/// innermost dimension as a row.
/// </summary>
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    var lastDim = inGrad.DimensionCount - 1;
    var cols = inGrad.Sizes[lastDim];
    var storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var xPtr = CudaHelpers.GetBufferStart(x);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: four floats per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream,
           outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the fused add + layer-normalization backward kernel ("gAddLayerNormalizationGrad"),
/// producing gradients for both summed inputs (<paramref name="out1Grad"/>, <paramref name="out2Grad"/>)
/// and for the scale/bias parameters. Tensors are viewed as (rows x cols) with the innermost
/// dimension as a row.
/// </summary>
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    int lastDim = inGrad.DimensionCount - 1;
    long cols = inGrad.Sizes[lastDim];
    long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1);
    CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: four floats per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream,
           out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches an "outer_index_*" reduction kernel over a non-innermost dimension, producing both the
/// reduced values and their indices (exact semantics are selected by <paramref name="baseKernelName"/>).
/// The tensor is collapsed to (num_orows, row_size, num_irows): dimensions before the reduced one,
/// the reduced dimension itself, and everything after it.
/// </summary>
private void ReduceIndexOuterDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, int dimension, Tuple<float, float> init, string baseKernelName)
{
    var cudaContext = context.CudaContextForTensor(src);

    var ndim = src.DimensionCount;

    long num_orows = 1;
    for (var d = 0; d < dimension; d++)
    {
        num_orows *= src.Sizes[d];
    }

    var row_size = src.Sizes[dimension];

    // Treat all inner dimensions (those after 'dimension') as one.
    long num_irows = 1;
    for (var d = dimension + 1; d < ndim; d++)
    {
        num_irows *= src.Sizes[d];
    }

    const int maxGridDim = 1024;
    var threads = new dim3((uint)Math.Min(512, num_irows));
    var grid = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

    var resultValPtr = CudaHelpers.GetBufferStart(resultValues);
    var resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    // init carries the reduction's seed value/index pair.
    Invoke(context, cudaContext, "outer_index_" + baseKernelName, grid, threads, 0, CUstream.NullStream,
           resultValPtr, resultIdxPtr, srcPtr, num_orows, num_irows, row_size, init.Item1, init.Item2);
}
//__global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
/// <summary>
/// Launches the "SGD" optimizer kernel, updating <paramref name="weight"/> from
/// <paramref name="gradient"/> with the cache and per-weight learning-rate tensors.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially updated. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based wrappers in this file. 2D tensors behave
/// exactly as before.
/// </remarks>
private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var ndim = weight.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }

    var cols = weight.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var cachePtr = CudaHelpers.GetBufferStart(cache);
    var lrwPtr = CudaHelpers.GetBufferStart(lrw);

    // Dynamic shared memory: one float per thread (see kernel signature above).
    Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream,
           weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
//BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)
/// <summary>
/// Fills <paramref name="mask"/> via the "BuildSrcTgtMask" kernel using the per-sequence source and
/// target lengths (exact masking rule lives in the kernel; see the signature comment above).
/// </summary>
private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
{
    CudaContext cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    // Fold all leading dimensions into a row count; the innermost dimension is the column count.
    int ndim = mask.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= mask.Sizes[dim];
    }

    long cols = mask.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr maskPtr = CudaHelpers.GetBufferStart(mask);
    CUdeviceptr originalSrcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
    CUdeviceptr originalTgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);

    Invoke(context, cudaContext, "BuildSrcTgtMask", grid, threads, 0, CUstream.NullStream,
           maskPtr, originalSrcLengthsPtr, originalTgtLengthsPtr, batchSize, rows, cols);
}
/// <summary>
/// Launches the "Adam" optimizer kernel, updating <paramref name="weight"/> from
/// <paramref name="gradient"/> together with the accumulator tensors <paramref name="v"/> and
/// <paramref name="m"/>.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially updated. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based Adam wrapper in this file. 2D tensors
/// behave exactly as before.
/// </remarks>
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var ndim = weight.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }

    var cols = weight.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var vPtr = CudaHelpers.GetBufferStart(v);
    var mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream,
           weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
/// <summary>
/// Fills <paramref name="mask"/> via the "BuildPadSelfTriMask" kernel, combining padding and
/// triangular self-attention masking from the per-sequence lengths (exact rule lives in the kernel).
/// </summary>
private void BuildPadSelfTriMask(TSCudaContext context, Tensor mask, Tensor originalLengths, int batchSize)
{
    var cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    // Fold all leading dimensions into a row count; the innermost dimension is the column count.
    var ndim = mask.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= mask.Sizes[dim];
    }

    var cols = mask.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var maskPtr = CudaHelpers.GetBufferStart(mask);
    var originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    Invoke(context, cudaContext, "BuildPadSelfTriMask", grid, threads, 0, CUstream.NullStream,
           maskPtr, originalLengthsPtr, batchSize, rows, cols);
}
/// <summary>
/// Launches the layer-normalization kernel ("gLNormalization") over <paramref name="src"/>,
/// writing into <paramref name="result"/> with scale <paramref name="alpha"/> and bias
/// <paramref name="beta"/>.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially normalized. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based LayerNorm wrapper in this file. 2D tensors
/// behave exactly as before.
/// </remarks>
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    var ndim = src.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= src.Sizes[dim];
    }

    var cols = src.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: one float per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream,
           resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the layer-normalization backward kernel ("gLayerNormalizationGrad"), producing the
/// input gradient plus gradients for the scale and bias parameters.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially processed. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based LayerNormGrad wrapper in this file.
/// 2D tensors behave exactly as before.
/// </remarks>
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    var ndim = inGrad.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= inGrad.Sizes[dim];
    }

    var cols = inGrad.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var xPtr = CudaHelpers.GetBufferStart(x);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: four floats per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream,
           outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}
// Backward pass of 2D spatial max-pooling via the "MaxPoolBackward" kernel: 'indices' presumably
// holds the forward pass's argmax positions used to route gradOutput back into gradInput — confirm
// against the kernel source. Tensors are assumed NCHW (dim 0 = batch, 1 = channels, 2 = height,
// 3 = width).
// NOTE(review): 'ceilMode' is unused here — presumably the output geometry is already fixed by
// gradOutput's sizes; confirm before removing the parameter.
public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(gradOutput);
    CudaContext cudaContext = context.CudaContextForTensor(gradOutput);

    // NCHW dimension indices.
    int dimw = 3;
    int dimh = 2;
    int dimc = 1;

    long nbatch = input.Sizes[0];
    long nslices = input.Sizes[dimc];
    long iheight = input.Sizes[dimh];
    long iwidth = input.Sizes[dimw];
    long owidth = gradOutput.Sizes[dimw];
    long oheight = gradOutput.Sizes[dimh];

    // The gradient buffer is made contiguous before handing its raw pointer to the kernel.
    using (Tensor gradOutputContig = Ops.AsContiguous(gradOutput))
    {
        CUdeviceptr gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
        CUdeviceptr indicesPtr = CudaHelpers.GetBufferStart(indices);
        CUdeviceptr gradInputPtr = CudaHelpers.GetBufferStart(gradInput);

        // One work item per input element.
        int count = (int)input.ElementCount();

        Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
               count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth,
               cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
    }
}
/// <summary>
/// Runs an element-wise kernel from <paramref name="kernels"/> over <paramref name="src"/>.
/// When <paramref name="result"/> aliases <paramref name="src"/> the single-tensor "t1_" kernel
/// variant is used (in place); otherwise the two-tensor "t2_" variant reads src and writes the
/// target. Returns the tensor that was written.
/// </summary>
public static Tensor Invoke(CudaCode kernels, string funcName, Tensor result, Tensor src)
{
    try
    {
        var context = CudaHelpers.TSContextForTensor(src);
        var cudaContext = context.CudaContextForTensor(src);
        cudaContext.SetCurrent();

        var writeTarget = TensorResultBuilder.GetWriteTarget(result, src, false, src.Sizes);
        var elementCount = writeTarget.ElementCount();
        var ptx = kernels.GetPtx(context.Compiler);

        if (result != src)
        {
            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t2_" + funcName, writeTarget, src, elementCount);
        }
        else
        {
            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t1_" + funcName, writeTarget, elementCount);
        }

        return writeTarget;
    }
    catch (Exception e)
    {
        // Log and rethrow, preserving the original stack trace.
        Logger.WriteLine($"Error = '{e.Message}', Call stack = '{e.StackTrace}'");
        throw;
    }
}
/// <summary>
/// Launches the softmax backward kernel ("gSoftmaxGrad"), combining <paramref name="adj"/> and
/// <paramref name="val"/> into <paramref name="grad"/>.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially processed. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based SoftmaxGrad wrapper in this file.
/// 2D tensors behave exactly as before.
/// </remarks>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
{
    var cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    var ndim = grad.DimensionCount;
    long rows = 1;
    for (var dim = 0; dim < ndim - 1; dim++)
    {
        rows *= grad.Sizes[dim];
    }

    var cols = grad.Sizes[ndim - 1];

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var valPtr = CudaHelpers.GetBufferStart(val);

    // Dynamic shared memory: one float per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream,
           gradPtr, adjPtr, valPtr, rows, cols);
}
/// <summary>
/// Launches the softmax backward kernel ("gSoftmaxGrad"), combining <paramref name="adj"/> and
/// <paramref name="val"/> into <paramref name="grad"/>. When <paramref name="addGrad"/> is true the
/// kernel accumulates into the existing gradient (flag is forwarded as an int).
/// </summary>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    CudaContext cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    // View the tensor as (rows x cols) with the innermost dimension as a row.
    int lastDim = grad.DimensionCount - 1;
    long cols = grad.Sizes[lastDim];
    long storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;
    int iAddGrad = addGrad ? 1 : 0;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
/// <summary>
/// Launches the "IndexSelectGrad" kernel, scattering rows of <paramref name="adj"/> back into
/// <paramref name="grad"/> at the positions given by <paramref name="indice"/> (exact accumulation
/// rule lives in the kernel).
/// </summary>
private void IndexSelectGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor indice)
{
    var cudaContext = context.CudaContextForTensor(adj);
    cudaContext.SetCurrent();

    // View 'adj' as (rows x cols) with the innermost dimension as a row.
    var lastDim = adj.DimensionCount - 1;
    var cols = adj.Sizes[lastDim];
    var storageSize = TensorDimensionHelpers.GetStorageSize(adj.Sizes, adj.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var indicePtr = CudaHelpers.GetBufferStart(indice);

    Invoke(context, cudaContext, "IndexSelectGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           gradPtr, adjPtr, indicePtr, rows, cols);
}
/// <summary>
/// Launches the "UpdateCost" kernel over <paramref name="weight"/>, using the id tensor
/// <paramref name="ids"/> and writing per-row results into <paramref name="costs"/> (exact cost
/// rule lives in the kernel).
/// </summary>
private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    // Fold all leading dimensions into a row count; the innermost dimension is the column count.
    int ndim = weight.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }

    long cols = weight.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr idsPtr = CudaHelpers.GetBufferStart(ids);
    CUdeviceptr costsPtr = CudaHelpers.GetBufferStart(costs);

    Invoke(context, cudaContext, "UpdateCost", grid, threads, 0, CUstream.NullStream,
           weightPtr, idsPtr, costsPtr, rows, cols);
}
/// <summary>
/// Launches the row-wise softmax kernel ("gSoftmax") over the innermost dimension of
/// <paramref name="src"/>, writing into <paramref name="result"/>.
/// </summary>
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    // View the tensor as (rows x cols) with the innermost dimension as a row.
    var lastDim = src.DimensionCount - 1;
    var cols = src.Sizes[lastDim];
    var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    var rows = storageSize / cols;
    if (rows * cols != storageSize)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    // Dynamic shared memory: one float per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           resultPtr, srcPtr, rows, cols);
}
/// <summary>
/// Launches the layer-normalization kernel ("gLNormalization") over <paramref name="src"/>,
/// writing into <paramref name="result"/> with scale <paramref name="alpha"/> and bias
/// <paramref name="beta"/>. The tensor is viewed as (rows x cols) with the innermost dimension
/// as a row.
/// </summary>
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    int lastDim = src.DimensionCount - 1;
    long cols = src.Sizes[lastDim];
    long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: one float per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the "RMSProp" optimizer kernel, updating <paramref name="weight"/> from
/// <paramref name="gradient"/> together with the running <paramref name="cache"/>. The weight
/// tensor is viewed as (rows x cols) with the innermost dimension as a row.
/// </summary>
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int lastDim = weight.DimensionCount - 1;
    long cols = weight.Sizes[lastDim];
    long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache);

    Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream,
           weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// Launches the softmax backward kernel ("gSoftmaxGrad"), combining <paramref name="adj"/> and
/// <paramref name="val"/> into <paramref name="grad"/>. When <paramref name="addGrad"/> is true the
/// kernel accumulates into the existing gradient (flag is forwarded as an int).
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially processed. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based SoftmaxGrad wrapper in this file.
/// 2D tensors behave exactly as before.
/// </remarks>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    CudaContext cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    int ndim = grad.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= grad.Sizes[dim];
    }

    long cols = grad.Sizes[ndim - 1];
    int iAddGrad = addGrad ? 1 : 0;

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val);

    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
// Copies 'totalElements' elements from the GPU tensor 'src' into the CPU tensor 'result'.
// The source is first made contiguous and the destination materialized as a CPU tensor of the
// source's element type, then a single synchronous device-to-host memcpy is issued.
public void CopyGpuToCpu(Tensor result, Tensor src, long totalElements)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    // NOTE(review): 'srcContext' is assigned but never used afterwards — presumably kept so the
    // lookup throws early for tensors without a CUDA context; confirm before removing.
    CudaContext srcContext = context.CudaContextForTensor(src);

    using (Tensor srcContig = Ops.AsContiguous(src))
    using (Tensor resultContig = AsTypeCpu(result, src.ElementType, true))
    {
        IntPtr resultContigPtr = ((Cpu.CpuStorage)resultContig.Storage).PtrAtElement(resultContig.StorageOffset);
        CUdeviceptr srcContigPtr = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);
        long totalBytes = totalElements * srcContig.ElementType.Size();

        // Use DriverAPINativeMethods directly here instead of CudaContext.CopyToHost, because CopyToHost only has an overload
        // for specifying totalBytes as a uint, but we may exceed the range of a uint here.
        CUResult res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(resultContigPtr, srcContigPtr, totalBytes);
        if (res != CUResult.Success)
        {
            throw new CudaException(res);
        }

        // If AsTypeCpu returned a temporary (different storage), finish with a CPU-side copy into the caller's tensor.
        if (result.Storage != resultContig.Storage)
        {
            Ops.Copy(result, resultContig); // copy on CPU
        }
    }
}
/// <summary>
/// Launches the row-wise softmax kernel ("gSoftmax") over the innermost dimension of
/// <paramref name="src"/>, writing into <paramref name="result"/>.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially processed. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based Softmax wrapper in this file. 2D tensors
/// behave exactly as before.
/// </remarks>
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    int ndim = src.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= src.Sizes[dim];
    }

    long cols = src.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

    // Dynamic shared memory: one float per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           resultPtr, srcPtr, rows, cols);
}
/// <summary>
/// Launches the "RMSProp" optimizer kernel, updating <paramref name="weight"/> from
/// <paramref name="gradient"/> together with the running <paramref name="cache"/>.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially updated. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based RMSProp wrapper in this file. 2D tensors
/// behave exactly as before.
/// </remarks>
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int ndim = weight.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= weight.Sizes[dim];
    }

    long cols = weight.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache);

    Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream,
           weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// Launches the fused add + layer-normalization kernel ("gAddLNormalization") over the element-wise
/// sum of <paramref name="src1"/> and <paramref name="src2"/>, writing into <paramref name="result"/>.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially processed. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based AddLayerNorm wrapper in this file.
/// 2D tensors behave exactly as before.
/// </remarks>
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    int ndim = src1.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= src1.Sizes[dim];
    }

    long cols = src1.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1);
    CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: one float per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream,
           resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the fused add + layer-normalization backward kernel ("gAddLayerNormalizationGrad"),
/// producing gradients for both summed inputs and for the scale/bias parameters.
/// </summary>
/// <remarks>
/// Fix: the original passed rows = Sizes[0] and cols = Sizes[1] to the kernel while sizing the
/// launch from the product of all leading dimensions, so tensors with more than two dimensions
/// were only partially processed. Rows/cols now use the same folded view (leading dims as rows,
/// last dim as cols), matching the storage-size-based AddLayerNormGrad wrapper in this file.
/// 2D tensors behave exactly as before.
/// </remarks>
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    int ndim = inGrad.DimensionCount;
    long rows = 1;
    for (int dim = 0; dim < ndim - 1; dim++)
    {
        rows *= inGrad.Sizes[dim];
    }

    long cols = inGrad.Sizes[ndim - 1];

    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1);
    CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Dynamic shared memory: four floats per thread (kernel-side scratch — confirm in kernel source).
    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream,
           out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the "kernel_varOuterDim*" variance kernel over an outer (non-innermost) dimension of
/// <paramref name="src"/>, writing into <paramref name="result"/>.
/// </summary>
/// <param name="context">The CUDA context wrapper.</param>
/// <param name="result">Destination array.</param>
/// <param name="src">Source array.</param>
/// <param name="dimension">The dimension to reduce over.</param>
/// <param name="normByN">Selects the kernel variant that normalizes by N (mangled into the kernel name).</param>
/// <param name="applySqrt">Selects the kernel variant that applies a square root (mangled into the kernel name).</param>
private void VarOuterDim(TSCudaContext context, NDArray result, NDArray src, int dimension, bool normByN, bool applySqrt)
{
    var cudaContext = context.CudaContextForTensor(src);

    // Collapse the array to (num_orows, row_size, num_irows): dims before the reduced one, the
    // reduced dim itself, and all inner dims treated as one.
    var ndim = src.DimensionCount;

    long num_orows = 1;
    for (var d = 0; d < dimension; d++)
    {
        num_orows *= src.Shape[d];
    }

    var row_size = src.Shape[dimension];

    long num_irows = 1;
    for (var d = dimension + 1; d < ndim; d++)
    {
        num_irows *= src.Shape[d];
    }

    const int maxGridDim = 1024;
    var threads = new dim3((uint)Math.Min(512, num_irows));
    var grid = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    var kernelName = "kernel_varOuterDim" + GetMangledNameSuffix(normByN, applySqrt);
    Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream,
           resultPtr, srcPtr, num_orows, num_irows, row_size);
}
/// <summary>
/// Dispatches a scatter kernel that writes values of <paramref name="src"/> into
/// <paramref name="result"/> along dimension <paramref name="dim"/> at the positions given by
/// <paramref name="indices"/>, returning the written tensor.
/// </summary>
/// <remarks>
/// Fix: the dimension-range guard used '&amp;&amp;' ("dim &lt; 0 &amp;&amp; dim &gt;= result.DimensionCount"),
/// a condition that can never be true, so out-of-range dims were never rejected. It now uses '||'.
/// </remarks>
public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
{
    TSCudaContext context = CudaHelpers.TSContextForTensor(src);
    CudaContext cudaContext = context.CudaContextForTensor(src);

    if (result == null)
    {
        throw new ArgumentNullException("result");
    }

    if (result.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("result and src must have same number of dimensions");
    }

    if (dim < 0 || dim >= result.DimensionCount)
    {
        throw new ArgumentOutOfRangeException("dim");
    }

    if (indices.DimensionCount != src.DimensionCount)
    {
        throw new InvalidOperationException("src and indices must have same number of dimensions");
    }

    if (!src.IsSameSizeAs(indices))
    {
        throw new InvalidOperationException("src and indices must be the same size");
    }

    if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
    {
        throw new InvalidOperationException("result and src must be the same size except in dimension dim");
    }

    Tensor writeTarget = result;
    long nElement = indices.ElementCount();
    dim3 block = ApplyUtils.GetApplyBlock();
    dim3 grid = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

    // The kernel name is specialized by index width (32- vs 64-bit) and, for 32-bit indexing,
    // by dimensionality up to 3 (-1 selects the generic variant).
    if (ApplyUtils.CanUse32BitIndexMath(writeTarget) && ApplyUtils.CanUse32BitIndexMath(src) && ApplyUtils.CanUse32BitIndexMath(indices))
    {
        int dims = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
        string kernelName = MakeKernelName(ScatterBaseName, true, dims);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true, writeTarget, src, indices, dim, (int)nElement);
    }
    else
    {
        string kernelName = MakeKernelName(ScatterBaseName, false, -1);
        Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false, writeTarget, src, indices, dim, nElement);
    }

    return writeTarget;
}
/// <summary>
/// Fills every element of <paramref name="result"/> with <paramref name="value"/> by launching the
/// "fill" kernel from <paramref name="kernels"/>.
/// </summary>
public static void Invoke(FillCopyKernels kernels, Tensor result, float value)
{
    var context = CudaHelpers.TSContextForTensor(result);
    var cudaContext = context.CudaContextForTensor(result);
    cudaContext.SetCurrent();

    var elementCount = result.ElementCount();
    var ptx = kernels.GetPtx(context.Compiler);
    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "fill", result, value, elementCount);
}