Example #1
        private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            var rows = src.Sizes[0];
            var cols = src.Sizes[1];

            var  ndim     = src.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var resultPtr = CudaHelpers.GetBufferStart(result);
            var srcPtr    = CudaHelpers.GetBufferStart(src);
            var alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            var betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
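A note on the launch configuration: every CUDA-side method in this listing builds it the same way. The tensor is treated as a matrix whose row count is the product of all leading dimensions and whose column count is the last dimension; at most 512 threads per block and at most 1024 blocks are launched (threads.y is 1, so ApplyUtils.CeilDiv(num_rows, threads.y) is simply num_rows). Below is a minimal standalone sketch of that pattern with hypothetical helper names; the real methods inline these computations exactly as shown above.

        static (long rows, long cols) CollapseToMatrix(long[] sizes)
        {
            long rows = 1;
            for (int dim = 0; dim < sizes.Length - 1; dim++)
            {
                rows *= sizes[dim];                     // fold every leading dimension into the row count
            }

            return (rows, sizes[sizes.Length - 1]);     // the last dimension is the column count
        }

        static (uint threadsPerBlock, uint blocks) GetRowLaunch(long rows)
        {
            // One block handles one row at a time, with at most 512 threads across the columns;
            // at most 1024 blocks are launched, so blocks presumably stride over any remaining rows.
            return ((uint)Math.Min(512, rows), (uint)Math.Min(1024, rows));
        }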
Example #2
        private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src1);

            cudaContext.SetCurrent();

            long rows = src1.Sizes[0];
            long cols = src1.Sizes[1];

            int  ndim     = src1.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src1.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr src1Ptr   = CudaHelpers.GetBufferStart(src1);
            CUdeviceptr src2Ptr   = CudaHelpers.GetBufferStart(src2);
            CUdeviceptr alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #3
        private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            CudaContext cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            long rows = weight.Sizes[0];
            long cols = weight.Sizes[1];

            int  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr weightPtr   = CudaHelpers.GetBufferStart(weight);
            CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
            CUdeviceptr cachePtr    = CudaHelpers.GetBufferStart(cache);

            Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
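The body of the RMSProp kernel is not part of this listing. As a reference point, here is a scalar sketch of a standard RMSProp update using the same parameter names; the exact order in which the CUDA kernel averages, clips, and regularizes the gradient is an assumption, not taken from the source.

        static void RmsPropStep(ref float w, float g, ref float cache, int batchSize,
                                float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            g = g / batchSize + regc * w;                              // average over the batch, add L2 regularization
            if (g > clipval) { g = clipval; }                          // clip the gradient to [-clipval, clipval]
            if (g < -clipval) { g = -clipval; }

            cache = decay_rate * cache + (1f - decay_rate) * g * g;    // running average of squared gradients
            w -= step_size * g / (MathF.Sqrt(cache) + eps);            // scaled parameter update
        }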
Example #4
        /// <summary>
        /// It gets the height of the parent (if it has one) and replaces the widget's height with it, then updates the spaces of its children.
        /// </summary>
        /// <param name="widget"></param>
        public void ApplyOn(IWidget widget)
        {
            ApplicationDone = false;
            if (widget.IsRoot)
            {
                return;
            }
            var (x, y, w, _) = widget.Space;

            if (!widget.Props.Contains<HorizontalStretch>() && widget.Children.Any())
            {
                w = widget.Children.Max(c => c.Space.Width);
            }

            var (nh, nonFinishedHorSiblings) = ApplyUtils.StretchedHeightUsingHeightSiblings(widget);
            nonFinishedHorSiblings.ForEach(s =>
            {
                var hp      = (IApplicableProp)s.Props.GetByProp<HorizontalStretch>().First();
                hp.Applied += (_, _) =>
                {
                    ApplicationDone = false;
                    WidgetsSpaceHelper.UpdateSpace(widget, new RectangleF(x, y, w, nh - s.Space.Height));
                    ApplicationDone = true;
                };
            });

            WidgetsSpaceHelper.UpdateSpace(widget,
                                           new RectangleF(x, y, w, nh - widget.Margins.Top - widget.Margins.Bottom));
            ApplicationDone = nonFinishedHorSiblings.Count == 0;
            OnApplied();
        }
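Example #5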
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            cudaContext.SetCurrent();

            CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

            // Pick the indexing specialization (32-bit vs. 64-bit indices) from the tensor
            // arguments and rewrite them into the representation the device kernels expect.
            IEnumerable<Tensor> allTensors = args.OfType<Tensor>();
            Tensor firstTensor       = allTensors.First();
            long   elementCount      = firstTensor.ElementCount();
            ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());

            ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

            // Element-wise launch configuration: fixed block size, grid scaled to elementCount.
            ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
            ManagedCuda.VectorTypes.dim3 grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

            // Resolve the specialization-mangled kernel name, look the compiled kernel up in the
            // cache, and launch asynchronously on the default stream.
            string     fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
            CudaKernel kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

            kernel.GridDimensions  = grid;
            kernel.BlockDimensions = block;
            kernel.RunAsync(CUstream.NullStream, args);
        }
Example #6
        private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            var ndim        = src.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
            var cols        = src.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;


            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var resultPtr = CudaHelpers.GetBufferStart(result);
            var srcPtr    = CudaHelpers.GetBufferStart(src);
            var alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            var betaPtr   = CudaHelpers.GetBufferStart(beta);

            this.Invoke(context, cudaContext, "gLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
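A worked example of the rows/cols derivation used above, assuming a hypothetical contiguous tensor of shape [8, 6, 512]:

        long[] sizes       = { 8, 6, 512 };             // hypothetical contiguous tensor shape
        long   storageSize = 8 * 6 * 512;               // 24576, what GetStorageSize returns for a contiguous layout
        long   cols        = sizes[sizes.Length - 1];   // 512
        long   rows        = storageSize / cols;        // 48: all leading dimensions folded into the row count
        // 24576 % 512 == 0, so the storage forms a valid rows-by-cols matrix; a storage size that is
        // not a multiple of the last dimension is rejected by the check above.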
Example #7
        private static dim3 GetContigReduceBlock(CudaContext cudaContext, long numSlices, long reductionSize)
        {
            // If the number of slices is low but the reduction dimension size
            // is high, then we should increase block size for greater parallelism.
            // Aim for at least 32 warps per SM (assume 15 SMs; don't bother
            // inquiring the real number for now).
            var smCount  = 15;
            var maxWarps = 4; // better occupancy if many blocks are around

            // For numSlices > smCount * 8, there are > 32 warps active per SM.
            if (numSlices < smCount * 8)
            {
                maxWarps = 8;
                if (numSlices < smCount * 4)
                {
                    maxWarps = 16;
                    if (numSlices < smCount * 2)
                    {
                        maxWarps = 32;
                    }
                }
            }

            // Scale up block size based on the reduction dimension size
            var warpsInReductionSize = ApplyUtils.CeilDiv(reductionSize, 32);
            var numWarps             =
                warpsInReductionSize > maxWarps ? maxWarps : (int)warpsInReductionSize;

            var targetSize = numWarps * 32;

            targetSize = Math.Min(targetSize, (int)cudaContext.GetDeviceInfo().MaxBlockDim.x);
            return new dim3(targetSize);
        }
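A worked pass through the heuristic above, using only the values visible in the code:

        // numSlices = 20, reductionSize = 4096:
        //   20 < 15 * 8, 15 * 4 and 15 * 2, so maxWarps grows to 32 (few slices -> favor large blocks)
        //   warpsInReductionSize = CeilDiv(4096, 32) = 128, which exceeds maxWarps, so numWarps = 32
        //   targetSize = 32 * 32 = 1024 threads, finally clamped to the device's MaxBlockDim.x
        // numSlices = 200, reductionSize = 4096:
        //   200 >= 15 * 8, so maxWarps stays at 4 and the block is capped at 128 threads;
        //   parallelism then comes from the larger number of blocks rather than the block size.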
Example #8
        private void IndexSelectGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor indice)
        {
            CudaContext cudaContext = context.CudaContextForTensor(adj);

            cudaContext.SetCurrent();

            int  ndim        = adj.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(adj.Sizes, adj.Strides);
            long cols        = adj.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr gradPtr   = CudaHelpers.GetBufferStart(grad);
            CUdeviceptr adjPtr    = CudaHelpers.GetBufferStart(adj);
            CUdeviceptr indicePtr = CudaHelpers.GetBufferStart(indice);


            Invoke(context, cudaContext, "IndexSelectGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, indicePtr, rows, cols);
        }
Example #9
        private void Softmax(TSCudaContext context, Tensor result, Tensor src)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            int  ndim        = src.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
            long cols        = src.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr srcPtr    = CudaHelpers.GetBufferStart(src);

            Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
        }
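The gSoftmax kernel itself is not shown in this listing. For reference, here is a single-row, max-shifted softmax in plain C#; the CUDA kernel performs the same two row-wide reductions (maximum, then sum) per block, which is presumably what the one float of dynamic shared memory per thread passed above is used for.

        static void SoftmaxRow(float[] row)
        {
            float max = float.NegativeInfinity;
            for (int i = 0; i < row.Length; i++) { max = Math.Max(max, row[i]); }       // row maximum for numerical stability

            float sum = 0f;
            for (int i = 0; i < row.Length; i++) { row[i] = MathF.Exp(row[i] - max); sum += row[i]; }

            for (int i = 0; i < row.Length; i++) { row[i] /= sum; }                     // normalize to probabilities
        }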
Example #10
        //__global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var rows = weight.Sizes[0];
            var cols = weight.Sizes[1];

            var  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var cachePtr    = CudaHelpers.GetBufferStart(cache);
            var lrwPtr      = CudaHelpers.GetBufferStart(lrw);

            Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
Example #11
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
        {
            var cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            var rows = grad.Sizes[0];
            var cols = grad.Sizes[1];

            var  ndim     = grad.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= grad.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var gradPtr = CudaHelpers.GetBufferStart(grad);
            var adjPtr  = CudaHelpers.GetBufferStart(adj);
            var valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols);
        }
Example #12
        /// <summary>
        /// Move the widgets to the bottom of a Row or right of a Column.
        /// </summary>
        /// <param name="widget"></param>
        public void ApplyOn(IWidget widget)
        {
            ApplicationDone = false;
            ApplyUtils.ApplyIfThereAreChildren(widget, $"{widget} has no children to item-base.",
                                               () =>
            {
                var rowsAtBase =
                    PutAtBase(widget, l => l.Height, GridHelper.WidgetHeight,
                              (y, c) => new Vector2(c.Space.X, y + c.Margins.Top));
                var colsAtBase =
                    PutAtBase(widget, l => l.Width, GridHelper.WidgetWidth,
                              (x, c) => new Vector2(x + c.Margins.Left, c.Space.Y));

                if (ApplyUtils.TryExtractRows(widget, out var rows))
                {
                    rowsAtBase(rows);
                }
                else if (ApplyUtils.TryExtractColumns(widget, out var cols))
                {
                    colsAtBase(cols);
                }
                else
                {
                    Log.Error(
                        "ItemBase can only be applied to a Row or Column Widget! Make sure this {W} has a Row or Column Prop",
                        widget.ToString());
                    throw new IncompatibleWidgetException(
                        "Tried to apply ItemBase to a widget without a Row or Column Prop");
                }
            });
            ApplicationDone = true;
            OnApplied();
        }
Example #13
        private void BuildPadSelfTriMask(TSCudaContext context, Tensor mask, Tensor originalLengths, int batchSize)
        {
            CudaContext cudaContext = context.CudaContextForTensor(mask);

            cudaContext.SetCurrent();

            int  ndim = mask.DimensionCount;
            long rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                rows *= mask.Sizes[dim];
            }

            long cols = mask.Sizes[ndim - 1];

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));


            CUdeviceptr maskPtr            = CudaHelpers.GetBufferStart(mask);
            CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

            Invoke(context, cudaContext, "BuildPadSelfTriMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalLengthsPtr, batchSize, rows, cols);
        }
Example #14
        private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var rows = weight.Sizes[0];
            var cols = weight.Sizes[1];

            var  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var vPtr        = CudaHelpers.GetBufferStart(v);
            var mPtr        = CudaHelpers.GetBufferStart(m);

            Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
        }
Example #15
        private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var  ndim = weight.DimensionCount;
            long rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                rows *= weight.Sizes[dim];
            }

            var cols = weight.Sizes[ndim - 1];

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var weightPtr = CudaHelpers.GetBufferStart(weight);
            var idsPtr    = CudaHelpers.GetBufferStart(ids);
            var costsPtr  = CudaHelpers.GetBufferStart(costs);

            this.Invoke(context, cudaContext, "UpdateCost", grid, threads, 0, CUstream.NullStream, weightPtr, idsPtr, costsPtr, rows, cols);
        }
Example #16
        private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
        {
            CudaContext cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            int  ndim        = weight.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
            long cols        = weight.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr weightPtr   = CudaHelpers.GetBufferStart(weight);
            CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
            CUdeviceptr vPtr        = CudaHelpers.GetBufferStart(v);
            CUdeviceptr mPtr        = CudaHelpers.GetBufferStart(m);

            Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
        }
Example #17
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();
            var ndim        = inGrad.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
            var cols        = inGrad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            var out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            var betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            var inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            var yPtr         = CudaHelpers.GetBufferStart(y);
            var x1Ptr        = CudaHelpers.GetBufferStart(x1);
            var x2Ptr        = CudaHelpers.GetBufferStart(x2);
            var alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            var betaPtr      = CudaHelpers.GetBufferStart(beta);

            this.Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #18
        private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            int  ndim        = inGrad.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
            long cols        = inGrad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr outGradPtr   = CudaHelpers.GetBufferStart(outGrad);
            CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            CUdeviceptr betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            CUdeviceptr inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            CUdeviceptr yPtr         = CudaHelpers.GetBufferStart(y);
            CUdeviceptr xPtr         = CudaHelpers.GetBufferStart(x);
            CUdeviceptr alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #19
        /// <summary>
        /// Move the widgets to the vertical center of a Row or to the horizontal center of a Column.
        /// </summary>
        /// <param name="widget"></param>
        public void ApplyOn(IWidget widget)
        {
            ApplicationDone = false;
            ApplyUtils.ApplyIfThereAreChildren(widget,
                                               $"{widget} has no children to item-center.",
                                               () =>
            {
                if (ApplyUtils.TryExtractRows(widget, out var rows))
                {
                    CenterHelper.ItemCenterVertical(widget, rows);
                }
                else if (ApplyUtils.TryExtractColumns(widget, out var cols))
                {
                    CenterHelper.ItemCenterHorizontal(widget, cols);
                }
                else
                {
                    Log.Error(
                        "ItemCenter can only be applied to a Row or Column Widget! Make sure this {W} has a Row or Column Prop",
                        widget.ToString());
                    throw new IncompatibleWidgetException(
                        "Tried to apply ItemCenter to a widget without a Row or Column Prop");
                }
            });
            ApplicationDone = false;
            OnApplied();
        }
Example #20
        private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src1);

            cudaContext.SetCurrent();

            int  ndim        = src1.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
            long cols        = src1.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr src1Ptr   = CudaHelpers.GetBufferStart(src1);
            CUdeviceptr src2Ptr   = CudaHelpers.GetBufferStart(src2);
            CUdeviceptr alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #21
        private static dim3 GridFromTiles(long gridTiles)
        {
            if (gridTiles > MaxGridSize * MaxGridSize * MaxGridSize)
            {
                throw new ArgumentException("gridTiles exceeds the maximum allowed tile count", nameof(gridTiles));
            }

            var  gridX = gridTiles > MaxGridSize ? MaxGridSize : gridTiles;
            long gridY = 1;
            long gridZ = 1;

            if (gridTiles > MaxGridSize)
            {
                gridTiles = ApplyUtils.CeilDiv(gridTiles, MaxGridSize);
                gridY     = gridTiles > MaxGridSize ? MaxGridSize : gridTiles;

                if (gridTiles > MaxGridSize)
                {
                    gridTiles = ApplyUtils.CeilDiv(gridTiles, MaxGridSize);
                    gridZ     = gridTiles > MaxGridSize ? MaxGridSize : gridTiles;
                }
            }

            return new dim3((uint)gridX, (uint)gridY, (uint)gridZ);
        }
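A worked example for GridFromTiles, assuming MaxGridSize holds 65535 (the usual per-dimension grid limit; the actual constant is not shown in this listing):

        // gridTiles = 200000:
        //   gridX = 65535                               (more tiles than fit in the x dimension alone)
        //   gridTiles = CeilDiv(200000, 65535) = 4
        //   gridY = 4, gridZ = 1
        // The 200000 tiles are therefore covered by a (65535, 4, 1) grid, and the kernel is
        // expected to reconstruct a linear tile index from the three block coordinates.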
Example #22
        private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
        {
            CudaContext cudaContext = context.CudaContextForTensor(originalLengths);

            cudaContext.SetCurrent();

            int  ndim        = result.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
            long cols        = result.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr          = CudaHelpers.GetBufferStart(result);
            CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);


            Invoke(context, cudaContext, "BuildSelfTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue);
        }
Example #23
        /// <summary>
        /// It gets the width of the parent (if it has one) and replaces the widget's width with it, then updates the spaces of its children.
        /// </summary>
        /// <param name="widget"></param>
        public void ApplyOn(IWidget widget)
        {
            ApplicationDone = false;
            if (widget.IsRoot)
            {
                return;
            }
            var (x, y, _, h) = widget.Space;

            if (!widget.Props.Contains<VerticalStretch>() && widget.Children.Any())
            {
                h = widget.Children.Max(c => c.TotalSpaceOccupied.Height);
            }

            var (nw, nonFinishedHorSiblings) = ApplyUtils.StretchedWidthUsingWidthSiblings(widget);
            nonFinishedHorSiblings.ForEach(s =>
            {
                var hp      = (IApplicableProp)s.Props.GetByProp<HorizontalStretch>().First();
                hp.Applied += (_, _) =>
                {
                    ApplicationDone = false;
                    WidgetsSpaceHelper.UpdateSpace(widget, new RectangleF(x, y, nw - s.Space.Width, h));
                    ApplicationDone = true;
                };
            });
            WidgetsSpaceHelper.UpdateSpace(widget,
                                           new RectangleF(x, y, nw - widget.Margins.Left - widget.Margins.Right, h));
            ApplicationDone = true;
            OnApplied();
        }
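Both stretch props above (Examples #4 and #23) defer part of their work: when a sibling's stretch prop has not been applied yet, they subscribe to its Applied event and recompute this widget's space once the sibling's final size is known. Below is a stripped-down sketch of that idea with entirely hypothetical names; only the event-subscription pattern mirrors the code above.

        class LayoutProp
        {
            public event EventHandler Applied;                     // raised once this prop has been applied
            public float FinalSize { get; private set; }

            public void Apply(float size)
            {
                FinalSize = size;
                Applied?.Invoke(this, EventArgs.Empty);            // notify widgets waiting on this sibling
            }
        }

        static void Demo()
        {
            var siblingStretch = new LayoutProp();
            float parentWidth  = 300f;

            // Our own width is not final until the sibling has claimed its share,
            // so recompute it when the sibling signals completion.
            siblingStretch.Applied += (_, _) =>
                Console.WriteLine($"our width = {parentWidth - siblingStretch.FinalSize}");

            siblingStretch.Apply(120f);                            // prints "our width = 180"
        }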
Example #24
        private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var ndim        = weight.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
            var cols        = weight.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var cachePtr    = CudaHelpers.GetBufferStart(cache);

            this.Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
Example #25
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            long rows = inGrad.Sizes[0];
            long cols = inGrad.Sizes[1];

            int  ndim     = inGrad.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= inGrad.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            CUdeviceptr out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            CUdeviceptr betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            CUdeviceptr inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            CUdeviceptr yPtr         = CudaHelpers.GetBufferStart(y);
            CUdeviceptr x1Ptr        = CudaHelpers.GetBufferStart(x1);
            CUdeviceptr x2Ptr        = CudaHelpers.GetBufferStart(x2);
            CUdeviceptr alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #26
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            var cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            var ndim        = grad.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
            var cols        = grad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;

            var iAddGrad = addGrad ? 1 : 0;

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var gradPtr = CudaHelpers.GetBufferStart(grad);
            var adjPtr  = CudaHelpers.GetBufferStart(adj);
            var valPtr  = CudaHelpers.GetBufferStart(val);

            this.Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
        }
Example #27
        private void Softmax(TSCudaContext context, Tensor result, Tensor src)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            long rows = src.Sizes[0];
            long cols = src.Sizes[1];

            int  ndim     = src.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr srcPtr    = CudaHelpers.GetBufferStart(src);

            Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
        }
Example #28
        //BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)

        private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
        {
            var cudaContext = context.CudaContextForTensor(mask);

            cudaContext.SetCurrent();

            var  ndim = mask.DimensionCount;
            long rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                rows *= mask.Sizes[dim];
            }

            var cols = mask.Sizes[ndim - 1];

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));


            var maskPtr = CudaHelpers.GetBufferStart(mask);
            var originalSrcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
            var originalTgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);

            this.Invoke(context, cudaContext, "BuildSrcTgtMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalSrcLengthsPtr, originalTgtLengthsPtr, batchSize, rows, cols);
        }
Example #29
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            CudaContext cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            long rows     = grad.Sizes[0];
            long cols     = grad.Sizes[1];
            int  iAddGrad = addGrad ? 1 : 0;

            int  ndim     = grad.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= grad.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
            CUdeviceptr adjPtr  = CudaHelpers.GetBufferStart(adj);
            CUdeviceptr valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
        }
Example #30
        private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            var rows = inGrad.Sizes[0];
            var cols = inGrad.Sizes[1];

            var  ndim     = inGrad.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= inGrad.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var outGradPtr   = CudaHelpers.GetBufferStart(outGrad);
            var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            var betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            var inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            var yPtr         = CudaHelpers.GetBufferStart(y);
            var xPtr         = CudaHelpers.GetBufferStart(x);
            var alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            var betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
        }