/// <summary>
/// Launches the "gLNormalization" CUDA kernel to layer-normalize <paramref name="src"/> into
/// <paramref name="result"/>, using <paramref name="alpha"/>/<paramref name="beta"/> as the
/// scale/bias tensors passed to the kernel.
/// </summary>
/// <param name="context">Provider of the CUDA context for the tensors.</param>
/// <param name="result">Output tensor.</param>
/// <param name="src">Input tensor; the innermost dimension is treated as the row width.</param>
/// <param name="alpha">Scale tensor forwarded to the kernel.</param>
/// <param name="beta">Bias tensor forwarded to the kernel.</param>
/// <param name="eps">Numerical-stability epsilon forwarded to the kernel.</param>
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    var ndim = src.DimensionCount;

    // Fix: derive rows/cols from the storage size and the innermost dimension. The previous
    // code passed Sizes[0]/Sizes[1] to the kernel while sizing the grid from the product of
    // all leading dimensions, so tensors with ndim > 2 were only partially processed.
    var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    var cols = src.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // threads.y is 1 here, so CeilDiv(rows, threads.y) == rows; the grid is capped at 1024 blocks.
    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: one float per thread in the block.
    Invoke(context, cudaContext, "gLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the "gAddLNormalization" CUDA kernel over <paramref name="src1"/> and
/// <paramref name="src2"/> (add + layer-norm, per the kernel name), writing into
/// <paramref name="result"/>.
/// </summary>
/// <param name="context">Provider of the CUDA context for the tensors.</param>
/// <param name="result">Output tensor.</param>
/// <param name="src1">First input tensor; defines the launch geometry.</param>
/// <param name="src2">Second input tensor.</param>
/// <param name="alpha">Scale tensor forwarded to the kernel.</param>
/// <param name="beta">Bias tensor forwarded to the kernel.</param>
/// <param name="eps">Numerical-stability epsilon forwarded to the kernel.</param>
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    int ndim = src1.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    long storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
    long cols = src1.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr src1Ptr = CudaHelpers.GetBufferStart(src1);
    CUdeviceptr src2Ptr = CudaHelpers.GetBufferStart(src2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: one float per thread in the block.
    Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the "RMSProp" CUDA kernel to update <paramref name="weight"/> in place from
/// <paramref name="gradient"/>, maintaining the running squared-gradient <paramref name="cache"/>.
/// </summary>
/// <param name="context">Provider of the CUDA context for the tensors.</param>
/// <param name="weight">Weights updated in place.</param>
/// <param name="gradient">Gradient tensor.</param>
/// <param name="cache">Optimizer state tensor forwarded to the kernel.</param>
/// <param name="batchSize">Batch size forwarded to the kernel.</param>
/// <param name="step_size">Learning rate forwarded to the kernel.</param>
/// <param name="clipval">Gradient clip value forwarded to the kernel.</param>
/// <param name="regc">Regularization coefficient forwarded to the kernel.</param>
/// <param name="decay_rate">Cache decay rate forwarded to the kernel.</param>
/// <param name="eps">Numerical-stability epsilon forwarded to the kernel.</param>
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int ndim = weight.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    long cols = weight.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache);

    Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// It gets the height of the parent (if it has one) and replaces the widget's height with it, then updates the spaces of its children.
/// </summary>
/// <param name="widget">The widget whose height is stretched; no-op on the root widget.</param>
public void ApplyOn(IWidget widget)
{
    ApplicationDone = false;

    // The root has no parent to stretch against.
    if (widget.IsRoot)
    {
        return;
    }

    var (x, y, w, _) = widget.Space;

    // Without a horizontal stretch, keep the width large enough for the widest child.
    if (!widget.Props.Contains<HorizontalStretch>() && widget.Children.Any())
    {
        w = widget.Children.Max(c => c.Space.Width);
    }

    var (nh, nonFinishedVertSiblings) = ApplyUtils.StretchedHeightUsingHeightSiblings(widget);

    nonFinishedVertSiblings.ForEach(s =>
    {
        // Fix: these siblings come from StretchedHeightUsingHeightSiblings, i.e. they carry
        // pending height (vertical) stretches, so hook their VerticalStretch prop. The old
        // code fetched GetByProp<HorizontalStretch>(), which subscribes to the wrong prop
        // (and throws if the sibling has no horizontal stretch at all).
        var vp = (IApplicableProp)s.Props.GetByProp<VerticalStretch>().First();
        vp.Applied += (_, _) =>
        {
            ApplicationDone = false;
            WidgetsSpaceHelper.UpdateSpace(widget, new RectangleF(x, y, w, nh - s.Space.Height));
            ApplicationDone = true;
        };
    });

    WidgetsSpaceHelper.UpdateSpace(widget, new RectangleF(x, y, w, nh - widget.Margins.Top - widget.Margins.Bottom));

    // Only done now if no sibling callback is still pending.
    ApplicationDone = nonFinishedVertSiblings.Count == 0;
    OnApplied();
}
/// <summary>
/// Resolves the specialized kernel named <paramref name="baseName"/> from the given PTX image
/// and launches it asynchronously on the null stream with the supplied arguments.
/// </summary>
/// <param name="context">Holds the kernel cache and per-context device info.</param>
/// <param name="cudaContext">CUDA context to launch on; made current before the launch.</param>
/// <param name="ptx">Compiled PTX image containing the kernel.</param>
/// <param name="baseName">Base kernel name; the specialization-mangled name is derived from it.</param>
/// <param name="args">Kernel arguments; Tensor entries are converted in place for the device.</param>
public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
{
    ThrowIfAnyTensorInvalid(args);

    cudaContext.SetCurrent();
    CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

    // Fix: materialize the tensor arguments once. The previous code kept a lazy
    // IEnumerable and enumerated it twice (First() and ToArray()), re-running the
    // OfType filter each time.
    Tensor[] allTensors = args.OfType<Tensor>().ToArray();
    Tensor firstTensor = allTensors.First();
    long elementCount = firstTensor.ElementCount();
    ApplySpecialization spec = new ApplySpecialization(allTensors);

    // NOTE: mutates the Tensor entries of args into their device-consumable form.
    ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

    ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
    ManagedCuda.VectorTypes.dim3 grid = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

    string fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
    CudaKernel kernel = context.KernelCache.Get(cudaContext, ptx, fullKernelName);
    kernel.GridDimensions = grid;
    kernel.BlockDimensions = block;

    kernel.RunAsync(CUstream.NullStream, args);
}
/// <summary>
/// Layer-normalizes <paramref name="src"/> into <paramref name="result"/> by launching the
/// "gLNormalization" CUDA kernel; alpha/beta are the scale/bias tensors passed through.
/// </summary>
private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    int dimCount = src.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    long cols = src.Sizes[dimCount - 1];

    // The storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    dim3 blockDims = new dim3((uint)Math.Min(512, rows));
    dim3 gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "gLNormalization", gridDims, blockDims, blockDims.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Chooses a 1-D block size for a contiguous reduction: fewer slices allow wider blocks
/// (more warps) for parallelism within the reduction dimension, clamped to the device limit.
/// </summary>
private static dim3 GetContigReduceBlock(CudaContext cudaContext, long numSlices, long reductionSize)
{
    // If the number of slices is low but the reduction dimension size is high, increase the
    // block size for greater parallelism. Aim for at least 32 warps per SM, assuming 15 SMs
    // rather than querying the real count.
    const int smCount = 15;

    // For numSlices > smCount * 8, there are > 32 warps active per SM; a small maxWarps
    // keeps occupancy better when many blocks are around.
    int maxWarps;
    if (numSlices < smCount * 2)
    {
        maxWarps = 32;
    }
    else if (numSlices < smCount * 4)
    {
        maxWarps = 16;
    }
    else if (numSlices < smCount * 8)
    {
        maxWarps = 8;
    }
    else
    {
        maxWarps = 4;
    }

    // Scale the block up with the reduction size, but never past maxWarps warps.
    long warpsNeeded = ApplyUtils.CeilDiv(reductionSize, 32);
    int numWarps = warpsNeeded > maxWarps ? maxWarps : (int)warpsNeeded;

    // Clamp to the hardware's maximum block width.
    int targetSize = Math.Min(numWarps * 32, (int)cudaContext.GetDeviceInfo().MaxBlockDim.x);

    return new dim3(targetSize);
}
/// <summary>
/// Launches the "IndexSelectGrad" CUDA kernel to accumulate gradients from
/// <paramref name="adj"/> into <paramref name="grad"/> at the positions given by
/// <paramref name="indice"/>.
/// </summary>
private void IndexSelectGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor indice)
{
    var cudaContext = context.CudaContextForTensor(adj);
    cudaContext.SetCurrent();

    var dimCount = adj.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(adj.Sizes, adj.Strides);
    var cols = adj.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    var blockDims = new dim3((uint)Math.Min(512, rows));
    var gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var indicePtr = CudaHelpers.GetBufferStart(indice);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "IndexSelectGrad", gridDims, blockDims, blockDims.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, indicePtr, rows, cols);
}
/// <summary>
/// Launches the "gSoftmax" CUDA kernel, computing a softmax over the innermost dimension of
/// <paramref name="src"/> into <paramref name="result"/>.
/// </summary>
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    var cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    var dimCount = src.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    var cols = src.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    var blockDims = new dim3((uint)Math.Min(512, rows));
    var gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var srcPtr = CudaHelpers.GetBufferStart(src);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "gSoftmax", gridDims, blockDims, blockDims.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
}
//__global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
/// <summary>
/// Launches the "SGD" CUDA kernel (signature above) to update <paramref name="weight"/> in
/// place from <paramref name="gradient"/>, with optimizer state in <paramref name="cache"/>
/// and per-weight learning rates in <paramref name="lrw"/>.
/// </summary>
private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var ndim = weight.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    var cols = weight.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var cachePtr = CudaHelpers.GetBufferStart(cache);
    var lrwPtr = CudaHelpers.GetBufferStart(lrw);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// Launches the "gSoftmaxGrad" CUDA kernel to compute the softmax gradient into
/// <paramref name="grad"/> from the adjoint <paramref name="adj"/> and the forward softmax
/// output <paramref name="val"/>.
/// </summary>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
{
    var cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    var ndim = grad.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    var storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
    var cols = grad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var gradPtr = CudaHelpers.GetBufferStart(grad);
    var adjPtr = CudaHelpers.GetBufferStart(adj);
    var valPtr = CudaHelpers.GetBufferStart(val);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols);
}
/// <summary>
/// Move the widgets to the bottom of a Row or right of a Column.
/// </summary>
/// <param name="widget">Widget carrying a Row or Column prop; otherwise an exception is thrown.</param>
public void ApplyOn(IWidget widget)
{
    ApplicationDone = false;

    ApplyUtils.ApplyIfThereAreChildren(widget, $"{widget} has no children to item-base.",
        () =>
        {
            // Two prepared alignments: rows go to the bottom edge, columns to the right edge.
            var alignRowsAtBase = PutAtBase(widget, l => l.Height, GridHelper.WidgetHeight,
                (y, c) => new Vector2(c.Space.X, y + c.Margins.Top));
            var alignColumnsAtBase = PutAtBase(widget, l => l.Width, GridHelper.WidgetWidth,
                (x, c) => new Vector2(x + c.Margins.Left, c.Space.Y));

            if (ApplyUtils.TryExtractRows(widget, out var rows))
            {
                alignRowsAtBase(rows);
            }
            else if (ApplyUtils.TryExtractColumns(widget, out var columns))
            {
                alignColumnsAtBase(columns);
            }
            else
            {
                Log.Error(
                    "ItemBase can only be applied to a Row or Column Widget! Make sure this {W} has a Row or Column Prop",
                    widget.ToString());
                throw new IncompatibleWidgetException(
                    "Tried to apply ItemBase to a widget without a Row or Column Prop");
            }
        });

    ApplicationDone = true;
    OnApplied();
}
/// <summary>
/// Launches the "BuildPadSelfTriMask" CUDA kernel to fill <paramref name="mask"/> with a
/// padding + triangular self-attention mask derived from <paramref name="originalLengths"/>.
/// </summary>
private void BuildPadSelfTriMask(TSCudaContext context, Tensor mask, Tensor originalLengths, int batchSize)
{
    var cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    var dimCount = mask.DimensionCount;

    // Rows = product of all leading dimensions; cols = innermost dimension.
    long rows = 1;
    for (var d = 0; d < dimCount - 1; d++)
    {
        rows *= mask.Sizes[d];
    }

    var cols = mask.Sizes[dimCount - 1];

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    var blockDims = new dim3((uint)Math.Min(512, rows));
    var gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    var maskPtr = CudaHelpers.GetBufferStart(mask);
    var originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    Invoke(context, cudaContext, "BuildPadSelfTriMask", gridDims, blockDims, 0, CUstream.NullStream, maskPtr, originalLengthsPtr, batchSize, rows, cols);
}
/// <summary>
/// Launches the "Adam" CUDA kernel to update <paramref name="weight"/> in place from
/// <paramref name="gradient"/>, maintaining the first/second-moment tensors
/// <paramref name="m"/> and <paramref name="v"/>.
/// </summary>
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var ndim = weight.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    var cols = weight.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var vPtr = CudaHelpers.GetBufferStart(v);
    var mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
/// <summary>
/// Launches the "UpdateCost" CUDA kernel over <paramref name="weight"/>, with the id and
/// cost tensors forwarded to the kernel.
/// </summary>
private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int dimCount = weight.DimensionCount;

    // Rows = product of all leading dimensions; cols = innermost dimension.
    long rows = 1;
    for (int d = 0; d < dimCount - 1; d++)
    {
        rows *= weight.Sizes[d];
    }

    long cols = weight.Sizes[dimCount - 1];

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    dim3 blockDims = new dim3((uint)Math.Min(512, rows));
    dim3 gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr idsPtr = CudaHelpers.GetBufferStart(ids);
    CUdeviceptr costsPtr = CudaHelpers.GetBufferStart(costs);

    Invoke(context, cudaContext, "UpdateCost", gridDims, blockDims, 0, CUstream.NullStream, weightPtr, idsPtr, costsPtr, rows, cols);
}
/// <summary>
/// Launches the "Adam" CUDA kernel to update <paramref name="weight"/> in place from
/// <paramref name="gradient"/>, with moment tensors <paramref name="v"/> and <paramref name="m"/>.
/// </summary>
private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
{
    var cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    var dimCount = weight.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    var cols = weight.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    var blockDims = new dim3((uint)Math.Min(512, rows));
    var gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    var weightPtr = CudaHelpers.GetBufferStart(weight);
    var gradientPtr = CudaHelpers.GetBufferStart(gradient);
    var vPtr = CudaHelpers.GetBufferStart(v);
    var mPtr = CudaHelpers.GetBufferStart(m);

    Invoke(context, cudaContext, "Adam", gridDims, blockDims, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
}
/// <summary>
/// Launches the "gAddLayerNormalizationGrad" CUDA kernel, producing the gradients of the
/// fused add + layer-norm: output gradients for both inputs plus alpha/beta gradients.
/// </summary>
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    int dimCount = inGrad.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    long cols = inGrad.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    dim3 blockDims = new dim3((uint)Math.Min(512, rows));
    dim3 gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1);
    CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: four floats per thread.
    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", gridDims, blockDims, blockDims.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the "gLayerNormalizationGrad" CUDA kernel, producing the layer-norm gradients
/// (input, alpha and beta) from the forward output and incoming gradient.
/// </summary>
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    var dimCount = inGrad.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    var cols = inGrad.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    var blockDims = new dim3((uint)Math.Min(512, rows));
    var gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    var outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var xPtr = CudaHelpers.GetBufferStart(x);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: four floats per thread.
    Invoke(context, cudaContext, "gLayerNormalizationGrad", gridDims, blockDims, blockDims.x * sizeof(float) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Move the widgets to the vertical center of a Row or to the horizontal center of a Column.
/// </summary>
/// <param name="widget">Widget carrying a Row or Column prop; otherwise an exception is thrown.</param>
public void ApplyOn(IWidget widget)
{
    ApplicationDone = false;

    ApplyUtils.ApplyIfThereAreChildren(widget, $"{widget} has no children to item-center.",
        () =>
        {
            if (ApplyUtils.TryExtractRows(widget, out var rows))
            {
                CenterHelper.ItemCenterVertical(widget, rows);
            }
            else if (ApplyUtils.TryExtractColumns(widget, out var cols))
            {
                CenterHelper.ItemCenterHorizontal(widget, cols);
            }
            else
            {
                Log.Error(
                    "ItemCenter can only be applied to a Row or Column Widget! Make sure this {W} has a Row or Column Prop",
                    widget.ToString());
                throw new IncompatibleWidgetException(
                    "Tried to apply ItemCenter to a widget without a Row or Column Prop");
            }
        });

    // Fix: the prop has been fully applied at this point, so mark it done before raising
    // Applied. The old code reset ApplicationDone to false here, unlike the parallel
    // ItemBase prop, which sets it to true after applying.
    ApplicationDone = true;
    OnApplied();
}
/// <summary>
/// Launches the "gAddLNormalization" CUDA kernel over <paramref name="src1"/> and
/// <paramref name="src2"/> (add + layer-norm, per the kernel name) into <paramref name="result"/>.
/// </summary>
private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(src1);
    cudaContext.SetCurrent();

    var dimCount = src1.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
    var cols = src1.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    var blockDims = new dim3((uint)Math.Min(512, rows));
    var gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var src1Ptr = CudaHelpers.GetBufferStart(src1);
    var src2Ptr = CudaHelpers.GetBufferStart(src2);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "gAddLNormalization", gridDims, blockDims, blockDims.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Distributes <paramref name="gridTiles"/> blocks over the x, y and z grid dimensions,
/// filling x first, then y, then z, each capped at MaxGridSize.
/// </summary>
/// <param name="gridTiles">Total number of blocks required.</param>
/// <exception cref="ArgumentException">Thrown when the tile count cannot fit in a 3-D grid.</exception>
private static dim3 GridFromTiles(long gridTiles)
{
    // Fix: force 64-bit multiplication for the capacity check. If MaxGridSize is a 32-bit
    // value (e.g. 65535), MaxGridSize^3 silently overflows int arithmetic and the guard
    // compares against garbage.
    if (gridTiles > (long)MaxGridSize * MaxGridSize * MaxGridSize)
    {
        throw new ArgumentException("gridTiles exceeds the maximum allowed tile count", nameof(gridTiles));
    }

    long gridX = gridTiles > MaxGridSize ? MaxGridSize : gridTiles;
    long gridY = 1;
    long gridZ = 1;

    if (gridTiles > MaxGridSize)
    {
        // Spill the remainder into y, and then into z if y alone is not enough.
        gridTiles = ApplyUtils.CeilDiv(gridTiles, MaxGridSize);
        gridY = gridTiles > MaxGridSize ? MaxGridSize : gridTiles;

        if (gridTiles > MaxGridSize)
        {
            gridTiles = ApplyUtils.CeilDiv(gridTiles, MaxGridSize);
            gridZ = gridTiles > MaxGridSize ? MaxGridSize : gridTiles;
        }
    }

    return new dim3((uint)gridX, (uint)gridY, (uint)gridZ);
}
/// <summary>
/// Launches the "BuildSelfTriMask" CUDA kernel to fill <paramref name="result"/> with a
/// triangular self-attention mask, writing <paramref name="value"/> for visible positions and
/// <paramref name="maskedValue"/> for masked ones (per the kernel's parameter list).
/// </summary>
private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
{
    var cudaContext = context.CudaContextForTensor(originalLengths);
    cudaContext.SetCurrent();

    var dimCount = result.DimensionCount;
    var storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
    var cols = result.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    var blockDims = new dim3((uint)Math.Min(512, rows));
    var gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    var resultPtr = CudaHelpers.GetBufferStart(result);
    var originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "BuildSelfTriMask", gridDims, blockDims, blockDims.x * sizeof(float), CUstream.NullStream, resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue);
}
/// <summary>
/// It gets the width of the parent (if it has one) and replaces the widget's width with it, then updates the spaces of its children.
/// </summary>
/// <param name="widget">The widget whose width is stretched; no-op on the root widget.</param>
public void ApplyOn(IWidget widget)
{
    ApplicationDone = false;

    // The root has no parent to stretch against.
    if (widget.IsRoot)
    {
        return;
    }

    var (x, y, _, h) = widget.Space;

    // Without a vertical stretch, keep the height large enough for the tallest child.
    if (!widget.Props.Contains<VerticalStretch>() && widget.Children.Any())
    {
        h = widget.Children.Max(c => c.TotalSpaceOccupied.Height);
    }

    var (nw, nonFinishedHorSiblings) = ApplyUtils.StretchedWidthUsingWidthSiblings(widget);

    nonFinishedHorSiblings.ForEach(s =>
    {
        var hp = (IApplicableProp)s.Props.GetByProp<HorizontalStretch>().First();
        hp.Applied += (_, _) =>
        {
            ApplicationDone = false;
            WidgetsSpaceHelper.UpdateSpace(widget, new RectangleF(x, y, nw - s.Space.Width, h));
            ApplicationDone = true;
        };
    });

    WidgetsSpaceHelper.UpdateSpace(widget, new RectangleF(x, y, nw - widget.Margins.Left - widget.Margins.Right, h));

    // Fix: only mark the prop done when no sibling stretch is still pending; the pending
    // siblings' Applied callbacks complete it later. The old code set true unconditionally,
    // unlike the parallel height-stretch prop, which uses the sibling count.
    ApplicationDone = nonFinishedHorSiblings.Count == 0;
    OnApplied();
}
/// <summary>
/// Launches the "RMSProp" CUDA kernel to update <paramref name="weight"/> in place from
/// <paramref name="gradient"/>, with optimizer state in <paramref name="cache"/>.
/// </summary>
private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
{
    CudaContext cudaContext = context.CudaContextForTensor(weight);
    cudaContext.SetCurrent();

    int dimCount = weight.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
    long cols = weight.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    dim3 blockDims = new dim3((uint)Math.Min(512, rows));
    dim3 gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
    CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
    CUdeviceptr cachePtr = CudaHelpers.GetBufferStart(cache);

    Invoke(context, cudaContext, "RMSProp", gridDims, blockDims, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
}
/// <summary>
/// Launches the "gAddLayerNormalizationGrad" CUDA kernel, producing the gradients of the
/// fused add + layer-norm: output gradients for both inputs plus alpha/beta gradients.
/// </summary>
private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    CudaContext cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    int ndim = inGrad.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    long cols = inGrad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr out1GradPtr = CudaHelpers.GetBufferStart(out1Grad);
    CUdeviceptr out2GradPtr = CudaHelpers.GetBufferStart(out2Grad);
    CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    CUdeviceptr betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    CUdeviceptr inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    CUdeviceptr yPtr = CudaHelpers.GetBufferStart(y);
    CUdeviceptr x1Ptr = CudaHelpers.GetBufferStart(x1);
    CUdeviceptr x2Ptr = CudaHelpers.GetBufferStart(x2);
    CUdeviceptr alphaPtr = CudaHelpers.GetBufferStart(alpha);
    CUdeviceptr betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: four floats per thread.
    Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
}
/// <summary>
/// Launches the "gSoftmaxGrad" CUDA kernel to compute the softmax gradient into
/// <paramref name="grad"/>; <paramref name="addGrad"/> is forwarded as an int flag.
/// </summary>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    CudaContext cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    int dimCount = grad.DimensionCount;
    long storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
    long cols = grad.Sizes[dimCount - 1];

    // Storage must factor exactly into (rows x cols) over the innermost dimension.
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // The kernel takes the accumulate flag as an int.
    int iAddGrad = addGrad ? 1 : 0;

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    dim3 blockDims = new dim3((uint)Math.Min(512, rows));
    dim3 gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "gSoftmaxGrad", gridDims, blockDims, blockDims.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
/// <summary>
/// Launches the "gSoftmax" CUDA kernel, computing a softmax over the innermost dimension of
/// <paramref name="src"/> into <paramref name="result"/>.
/// </summary>
private void Softmax(TSCudaContext context, Tensor result, Tensor src)
{
    CudaContext cudaContext = context.CudaContextForTensor(src);
    cudaContext.SetCurrent();

    int ndim = src.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
    long cols = src.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
    CUdeviceptr srcPtr = CudaHelpers.GetBufferStart(src);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
}
//BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)
/// <summary>
/// Launches the "BuildSrcTgtMask" CUDA kernel (signature above) to fill
/// <paramref name="mask"/> from the source and target length tensors.
/// </summary>
private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
{
    CudaContext cudaContext = context.CudaContextForTensor(mask);
    cudaContext.SetCurrent();

    int dimCount = mask.DimensionCount;

    // Rows = product of all leading dimensions; cols = innermost dimension.
    long rows = 1;
    for (int d = 0; d < dimCount - 1; d++)
    {
        rows *= mask.Sizes[d];
    }

    long cols = mask.Sizes[dimCount - 1];

    // blockDims.y is 1, so CeilDiv(rows, blockDims.y) == rows; grid capped at 1024 blocks.
    dim3 blockDims = new dim3((uint)Math.Min(512, rows));
    dim3 gridDims = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, blockDims.y)));

    CUdeviceptr maskPtr = CudaHelpers.GetBufferStart(mask);
    CUdeviceptr originalSrcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
    CUdeviceptr originalTgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);

    Invoke(context, cudaContext, "BuildSrcTgtMask", gridDims, blockDims, 0, CUstream.NullStream, maskPtr, originalSrcLengthsPtr, originalTgtLengthsPtr, batchSize, rows, cols);
}
/// <summary>
/// Launches the "gSoftmaxGrad" CUDA kernel to compute the softmax gradient into
/// <paramref name="grad"/>; <paramref name="addGrad"/> is forwarded as an int flag.
/// </summary>
private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
{
    CudaContext cudaContext = context.CudaContextForTensor(grad);
    cudaContext.SetCurrent();

    int iAddGrad = addGrad ? 1 : 0;
    int ndim = grad.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    long storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
    long cols = grad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    long rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    dim3 threads = new dim3((uint)Math.Min(512, rows));
    dim3 grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
    CUdeviceptr adjPtr = CudaHelpers.GetBufferStart(adj);
    CUdeviceptr valPtr = CudaHelpers.GetBufferStart(val);

    // Shared memory: one float per thread.
    Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
}
/// <summary>
/// Launches the "gLayerNormalizationGrad" CUDA kernel, producing the layer-norm gradients
/// (input, alpha and beta) from the forward output and incoming gradient.
/// </summary>
private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
{
    var cudaContext = context.CudaContextForTensor(inGrad);
    cudaContext.SetCurrent();

    var ndim = inGrad.DimensionCount;

    // Fix: compute rows/cols from the storage size so tensors with ndim > 2 are fully
    // covered; the previous code passed Sizes[0]/Sizes[1] to the kernel while sizing the
    // grid from the product of all leading dimensions.
    var storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
    var cols = inGrad.Sizes[ndim - 1];
    if (storageSize % cols != 0)
    {
        throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
    }

    var rows = storageSize / cols;

    // threads.y is 1, so the grid size is min(1024, rows).
    var threads = new dim3((uint)Math.Min(512, rows));
    var grid = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

    var outGradPtr = CudaHelpers.GetBufferStart(outGrad);
    var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
    var betaGradPtr = CudaHelpers.GetBufferStart(betaGrad);
    var inGradPtr = CudaHelpers.GetBufferStart(inGrad);
    var yPtr = CudaHelpers.GetBufferStart(y);
    var xPtr = CudaHelpers.GetBufferStart(x);
    var alphaPtr = CudaHelpers.GetBufferStart(alpha);
    var betaPtr = CudaHelpers.GetBufferStart(beta);

    // Shared memory: four floats per thread.
    Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
}