Example #1
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            var cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            var ndim        = grad.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
            var cols        = grad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;

            var iAddGrad = addGrad ? 1 : 0;

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var gradPtr = CudaHelpers.GetBufferStart(grad);
            var adjPtr  = CudaHelpers.GetBufferStart(adj);
            var valPtr  = CudaHelpers.GetBufferStart(val);

            this.Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
        }
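Nearly every wrapper in this listing derives rows and cols the same way: the last dimension becomes cols, everything else is flattened into rows, and the storageSize % cols check guards against layouts where that flattening would be invalid. As a sketch only, the shared logic could be pulled into a helper like the hypothetical GetRowsAndCols below (not part of the original source):

        // Hypothetical helper (not in the original code): derives the (rows, cols) pair
        // that the kernel launches above pass to the CUDA kernels.
        private static (long Rows, long Cols) GetRowsAndCols(Tensor t)
        {
            var ndim        = t.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(t.Sizes, t.Strides);
            var cols        = t.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            return (storageSize / cols, cols);
        }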
Example #2
        //BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)

        private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
        {
            var cudaContext = context.CudaContextForTensor(mask);

            cudaContext.SetCurrent();

            var  ndim = mask.DimensionCount;
            long rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                rows *= mask.Sizes[dim];
            }

            var cols = mask.Sizes[ndim - 1];

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));


            var maskPtr = CudaHelpers.GetBufferStart(mask);
            var originalSrcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
            var originalTgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);

            this.Invoke(context, cudaContext, "BuildSrcTgtMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalSrcLengthsPtr, originalTgtLengthsPtr, batchSize, rows, cols);
        }
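The launch configuration is identical across these wrappers: at most 512 threads per block and at most 1024 blocks. Assuming the single-argument dim3 constructor leaves y and z at 1 (the usual CUDA convention), ApplyUtils.CeilDiv(rows, threads.y) is simply rows, so the grid size works out to min(1024, rows). The hypothetical helper below (not in the original source) just restates that geometry:

        // Illustration only: the thread/block geometry computed by the wrappers above.
        // For rows = 2048 this yields threads = (512, 1, 1) and grid = (1024, 1, 1),
        // so the kernels presumably walk any remaining rows in a grid-stride loop
        // (their bodies are not shown in these examples).
        private static (dim3 Threads, dim3 Grid) LaunchGeometryExample(long rows)
        {
            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            return (threads, grid);
        }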
Example #3
        private void BuildSelfTriMask(TSCudaContext context, Tensor result, Tensor originalLengths, int paddedSeqLen, float value, float maskedValue)
        {
            CudaContext cudaContext = context.CudaContextForTensor(originalLengths);

            cudaContext.SetCurrent();

            int  ndim        = result.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
            long cols        = result.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr          = CudaHelpers.GetBufferStart(result);
            CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);


            Invoke(context, cudaContext, "BuildSelfTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, originalLengthsPtr, rows, cols, paddedSeqLen, value, maskedValue);
        }
Example #4
        private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var ndim        = weight.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
            var cols        = weight.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var cachePtr    = CudaHelpers.GetBufferStart(cache);

            this.Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
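The RMSProp kernel body is not included in these examples, but the argument list (cache, decay_rate, eps, step_size, plus clipval and regc) matches the textbook RMSProp update, so per weight element w with gradient g the kernel presumably performs something along the lines of

$$c \leftarrow \rho\,c + (1-\rho)\,g^2, \qquad w \leftarrow w - \eta\,\frac{g}{\sqrt{c} + \epsilon}$$

with rho = decay_rate, eta = step_size and epsilon = eps; clipval and regc suggest that gradient clipping and L2 regularization are folded in as well, and batchSize presumably scales the accumulated gradient. This is a textbook reference, not a transcription of the actual kernel.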
Example #5
        private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            int  ndim        = inGrad.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
            long cols        = inGrad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr outGradPtr   = CudaHelpers.GetBufferStart(outGrad);
            CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            CUdeviceptr betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            CUdeviceptr inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            CUdeviceptr yPtr         = CudaHelpers.GetBufferStart(y);
            CUdeviceptr xPtr         = CudaHelpers.GetBufferStart(x);
            CUdeviceptr alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #6
        private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src1);

            cudaContext.SetCurrent();

            int  ndim        = src1.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(src1.Sizes, src1.Strides);
            long cols        = src1.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr src1Ptr   = CudaHelpers.GetBufferStart(src1);
            CUdeviceptr src2Ptr   = CudaHelpers.GetBufferStart(src2);
            CUdeviceptr alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #7
        private void Softmax(TSCudaContext context, Tensor result, Tensor src)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            int  ndim        = src.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
            long cols        = src.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr srcPtr    = CudaHelpers.GetBufferStart(src);

            Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
        }
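gSoftmax reduces over the cols entries of each row, and the threads.x * sizeof(float) bytes of shared memory passed at launch presumably serve as the per-block reduction buffer. The kernel source is not reproduced here; the quantity it computes is the usual numerically stabilized softmax

$$\mathrm{softmax}(x)_j = \frac{e^{x_j - m}}{\sum_i e^{x_i - m}}, \qquad m = \max_k x_k,$$

where subtracting the row maximum is the standard overflow guard (whether this particular kernel applies it is an assumption).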
Example #8
        private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            var ndim        = src.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
            var cols        = src.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;


            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var resultPtr = CudaHelpers.GetBufferStart(result);
            var srcPtr    = CudaHelpers.GetBufferStart(src);
            var alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            var betaPtr   = CudaHelpers.GetBufferStart(beta);

            this.Invoke(context, cudaContext, "gLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
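gLNormalization normalizes every row of src and applies the learned gain alpha and bias beta. With the kernel body not shown, the textbook layer normalization these arguments correspond to is

$$y = \alpha \odot \frac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta,$$

where mu and sigma^2 are the mean and variance over the cols entries of the row and epsilon is the eps argument (default 1e-9f).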
Example #9
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
        {
            var cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            var rows = grad.Sizes[0];
            var cols = grad.Sizes[1];

            var  ndim     = grad.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= grad.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var gradPtr = CudaHelpers.GetBufferStart(grad);
            var adjPtr  = CudaHelpers.GetBufferStart(adj);
            var valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols);
        }
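gSoftmaxGrad receives the saved softmax output (val) and the upstream gradient (adj). Its source is not shown here; the standard per-row softmax backward pass it corresponds to is

$$\frac{\partial L}{\partial x_j} = y_j\Big(a_j - \sum_i a_i\,y_i\Big),$$

with y a row of val and a the matching row of adj. In the overloads that take addGrad (Examples #1 and #18), the iAddGrad flag presumably selects between accumulating into grad and overwriting it.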
Example #10
        private void IndexSelectGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor indice)
        {
            CudaContext cudaContext = context.CudaContextForTensor(adj);

            cudaContext.SetCurrent();

            int  ndim        = adj.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(adj.Sizes, adj.Strides);
            long cols        = adj.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr gradPtr   = CudaHelpers.GetBufferStart(grad);
            CUdeviceptr adjPtr    = CudaHelpers.GetBufferStart(adj);
            CUdeviceptr indicePtr = CudaHelpers.GetBufferStart(indice);


            Invoke(context, cudaContext, "IndexSelectGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, indicePtr, rows, cols);
        }
Example #11
        //__global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var rows = weight.Sizes[0];
            var cols = weight.Sizes[1];

            var  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var cachePtr    = CudaHelpers.GetBufferStart(cache);
            var lrwPtr      = CudaHelpers.GetBufferStart(lrw);

            Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
Example #12
        private void BuildPadSelfTriMask(TSCudaContext context, Tensor mask, Tensor originalLengths, int batchSize)
        {
            CudaContext cudaContext = context.CudaContextForTensor(mask);

            cudaContext.SetCurrent();

            int  ndim = mask.DimensionCount;
            long rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                rows *= mask.Sizes[dim];
            }

            long cols = mask.Sizes[ndim - 1];

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));


            CUdeviceptr maskPtr            = CudaHelpers.GetBufferStart(mask);
            CUdeviceptr originalLengthsPtr = CudaHelpers.GetBufferStart(originalLengths);

            Invoke(context, cudaContext, "BuildPadSelfTriMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalLengthsPtr, batchSize, rows, cols);
        }
Example #13
        private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var rows = weight.Sizes[0];
            var cols = weight.Sizes[1];

            var  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var vPtr        = CudaHelpers.GetBufferStart(v);
            var mPtr        = CudaHelpers.GetBufferStart(m);

            Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
        }
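The Adam wrappers hand the kernel two moment tensors (m and v), two decay rates, the iteration counter and eps. The kernel itself is not shown, so the following is the textbook Adam update those parameters suggest rather than a verbatim transcription:

$$m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \quad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2, \quad w_t = w_{t-1} - \eta\,\frac{\hat m_t}{\sqrt{\hat v_t} + \epsilon},$$

with bias corrections $\hat m_t = m_t/(1-\beta_1^t)$ and $\hat v_t = v_t/(1-\beta_2^t)$, where beta_1 and beta_2 map to decay_rate_m and decay_rate_v, eta to step_size and t to iter; clipval, regc and batchSize again point at gradient clipping, L2 regularization and gradient averaging being handled inside the kernel.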
Example #14
        private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            var rows = src.Sizes[0];
            var cols = src.Sizes[1];

            var  ndim     = src.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var resultPtr = CudaHelpers.GetBufferStart(result);
            var srcPtr    = CudaHelpers.GetBufferStart(src);
            var alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            var betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #15
        private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
        {
            CudaContext cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            int  ndim        = weight.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
            long cols        = weight.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr weightPtr   = CudaHelpers.GetBufferStart(weight);
            CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
            CUdeviceptr vPtr        = CudaHelpers.GetBufferStart(v);
            CUdeviceptr mPtr        = CudaHelpers.GetBufferStart(m);

            Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
        }
Example #16
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();
            var ndim        = inGrad.DimensionCount;
            var storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
            var cols        = inGrad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            var rows = storageSize / cols;

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            var out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            var betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            var inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            var yPtr         = CudaHelpers.GetBufferStart(y);
            var x1Ptr        = CudaHelpers.GetBufferStart(x1);
            var x2Ptr        = CudaHelpers.GetBufferStart(x2);
            var alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            var betaPtr      = CudaHelpers.GetBufferStart(beta);

            this.Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #17
        private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var  ndim = weight.DimensionCount;
            long rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                rows *= weight.Sizes[dim];
            }

            var cols = weight.Sizes[ndim - 1];

            var threads = new dim3((uint)Math.Min(512, rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            var weightPtr = CudaHelpers.GetBufferStart(weight);
            var idsPtr    = CudaHelpers.GetBufferStart(ids);
            var costsPtr  = CudaHelpers.GetBufferStart(costs);

            this.Invoke(context, cudaContext, "UpdateCost", grid, threads, 0, CUstream.NullStream, weightPtr, idsPtr, costsPtr, rows, cols);
        }
Example #18
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            CudaContext cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            long rows     = grad.Sizes[0];
            long cols     = grad.Sizes[1];
            int  iAddGrad = addGrad ? 1 : 0;

            int  ndim     = grad.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= grad.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
            CUdeviceptr adjPtr  = CudaHelpers.GetBufferStart(adj);
            CUdeviceptr valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
        }
Example #19
        private void LayerNormGrad(TSCudaContext context, Tensor outGrad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            var rows = inGrad.Sizes[0];
            var cols = inGrad.Sizes[1];

            var  ndim     = inGrad.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= inGrad.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var outGradPtr   = CudaHelpers.GetBufferStart(outGrad);
            var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            var betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            var inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            var yPtr         = CudaHelpers.GetBufferStart(y);
            var xPtr         = CudaHelpers.GetBufferStart(x);
            var alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            var betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream, outGradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, xPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #20
        private void Softmax(TSCudaContext context, Tensor result, Tensor src)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            long rows = src.Sizes[0];
            long cols = src.Sizes[1];

            int  ndim     = src.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr srcPtr    = CudaHelpers.GetBufferStart(src);

            Invoke(context, cudaContext, "gSoftmax", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
        }
Example #21
        private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            CudaContext cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            long rows = weight.Sizes[0];
            long cols = weight.Sizes[1];

            int  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr weightPtr   = CudaHelpers.GetBufferStart(weight);
            CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
            CUdeviceptr cachePtr    = CudaHelpers.GetBufferStart(cache);

            Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
Example #22
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            long rows = inGrad.Sizes[0];
            long cols = inGrad.Sizes[1];

            int  ndim     = inGrad.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= inGrad.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            CUdeviceptr out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            CUdeviceptr betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            CUdeviceptr inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            CUdeviceptr yPtr         = CudaHelpers.GetBufferStart(y);
            CUdeviceptr x1Ptr        = CudaHelpers.GetBufferStart(x1);
            CUdeviceptr x2Ptr        = CudaHelpers.GetBufferStart(x2);
            CUdeviceptr alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #23
        private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src1);

            cudaContext.SetCurrent();

            long rows = src1.Sizes[0];
            long cols = src1.Sizes[1];

            int  ndim     = src1.DimensionCount;
            long num_rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src1.Sizes[dim];
            }

            dim3 threads = new dim3((uint)Math.Min(512, num_rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr src1Ptr   = CudaHelpers.GetBufferStart(src1);
            CUdeviceptr src2Ptr   = CudaHelpers.GetBufferStart(src2);
            CUdeviceptr alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example #24
        private static void Run_M_V_float(TSCudaContext context, Tensor result, Tensor mat, Tensor vec)
        {
            // Require lhs to be row-major. This means we must tell BLAS to transpose it (BLAS expects column-major matrices)
            if (mat.Strides[1] != 1)
            {
                throw new ArgumentException("lhs must be contiguous in the last dimension");
            }

            using (var blas = context.BlasForTensor(mat))
            {
                var yPtr = CudaHelpers.GetBufferStart(result);
                var aPtr = CudaHelpers.GetBufferStart(mat);
                var xPtr = CudaHelpers.GetBufferStart(vec);

                Operation trans = Operation.Transpose;
                int       m     = (int)mat.Sizes[1];
                int       n     = (int)mat.Sizes[0];
                int       incx  = (int)vec.Strides[0];
                int       lda   = (int)mat.Strides[0];
                int       incy  = (int)result.Strides[0];
                float     alpha = 1;
                float     beta  = 0;

                CudaBlasNativeMethods.cublasSgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
            }
        }
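cuBLAS assumes column-major storage, so the row-major matrix mat of shape (Sizes[0], Sizes[1]) is presented to cublasSgemv_v2 as an m-by-n column-major matrix with m = Sizes[1], n = Sizes[0] and lda = Strides[0], and Operation.Transpose is requested. With alpha = 1 and beta = 0 the call therefore evaluates

$$y = A^{\mathsf T} x,$$

and since that column-major A is exactly the transpose of the logical row-major matrix, the net effect is result = mat · vec, an ordinary matrix-vector product over the row-major data.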
Example #25
        /// <summary>
        /// Computes the variance along the specified dimension using the outer-dimension kernel.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="result">The result.</param>
        /// <param name="src">The source.</param>
        /// <param name="dimension">The dimension.</param>
        /// <param name="normByN">if set to <c>true</c> [norm by n].</param>
        /// <param name="applySqrt">if set to <c>true</c> [apply SQRT].</param>
        private void VarOuterDim(TSCudaContext context, NDArray result, NDArray src, int dimension, bool normByN, bool applySqrt)
        {
            var cudaContext = context.CudaContextForTensor(src);

            int  ndim      = src.DimensionCount;
            long num_orows = 1;

            for (int dim = 0; dim < dimension; dim++)
            {
                num_orows *= src.Shape[dim];
            }
            long row_size = src.Shape[dimension];
            // Treat all inner dimensions (i.e. dim > dimension) as one.
            long num_irows = 1;

            for (int dim = dimension + 1; dim < ndim; dim++)
            {
                num_irows *= src.Shape[dim];
            }

            var threads    = new dim3((uint)Math.Min(512, num_irows));
            var maxGridDim = 1024;
            var grid       = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

            var resultPtr  = CudaHelpers.GetBufferStart(result);
            var srcPtr     = CudaHelpers.GetBufferStart(src);
            var kernelName = "kernel_varOuterDim" + GetMangledNameSuffix(normByN, applySqrt);

            Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultPtr, srcPtr, num_orows, num_irows, row_size);
        }
Example #26
        private void ReduceIndexOuterDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, int dimension, Tuple<float, float> init, string baseKernelName)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            int  ndim      = src.DimensionCount;
            long num_orows = 1;

            for (int dim = 0; dim < dimension; dim++)
            {
                num_orows *= src.Sizes[dim];
            }
            long row_size  = src.Sizes[dimension];
            long num_irows = 1;

            for (int dim = dimension + 1; dim < ndim; dim++)
            {
                num_irows *= src.Sizes[dim];
            }

            dim3 threads    = new dim3((uint)Math.Min(512, num_irows));
            int  maxGridDim = 1024;
            dim3 grid       = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

            CUdeviceptr resultValPtr = CudaHelpers.GetBufferStart(resultValues);
            CUdeviceptr resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices);
            CUdeviceptr srcPtr       = CudaHelpers.GetBufferStart(src);

            string kernelName = "outer_index_" + baseKernelName;

            Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultValPtr, resultIdxPtr, srcPtr, num_orows, num_irows, row_size, init.Item1, init.Item2);
        }
Example #27
        /// <summary>
        /// Rearranges a column buffer back into image layout (col2im).
        /// </summary>
        /// <param name="col">The col.</param>
        /// <param name="im">The im.</param>
        /// <param name="channels">The channels.</param>
        /// <param name="height">The height.</param>
        /// <param name="width">The width.</param>
        /// <param name="patch_h">The patch h.</param>
        /// <param name="patch_w">The patch w.</param>
        /// <param name="pad_h">The pad h.</param>
        /// <param name="pad_w">The pad w.</param>
        /// <param name="stride_h">The stride h.</param>
        /// <param name="stride_w">The stride w.</param>
        /// <param name="dilation_h">The dilation h.</param>
        /// <param name="dilation_w">The dilation w.</param>
        public void Col2Im(Tensor col, Tensor im, int channels, int height, int width,
                           int patch_h, int patch_w, int pad_h,
                           int pad_w, int stride_h, int stride_w,
                           int dilation_h, int dilation_w)
        {
            var context     = CudaHelpers.TSContextForTensor(im);
            var cudaContext = context.CudaContextForTensor(im);


            int height_col = (height + 2 * pad_h - (dilation_h * (patch_h - 1) + 1))
                             / stride_h + 1;
            int width_col = (width + 2 * pad_w - (dilation_w * (patch_w - 1) + 1))
                            / stride_w + 1;
            int num_kernels = channels * height * width;

            var data_im  = CudaHelpers.GetBufferStart(im);
            var data_col = CudaHelpers.GetBufferStart(col);

            // From Torch source:
            // To avoid involving atomic operations, we will launch one kernel per
            // bottom dimension, and then in the kernel add up the top dimensions.

            Invoke(context, cudaContext, "col2im_kernel", new dim3(NNThreads.NumBlocks(num_kernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                   num_kernels, data_col, height, width, channels, patch_h, patch_w, pad_h, pad_w, stride_h, stride_w,
                   dilation_h, dilation_w,
                   height_col, width_col, data_im);
        }
Example #28
        /// <summary>
        /// Rearranges an image into a column buffer (im2col).
        /// </summary>
        /// <param name="im">The im.</param>
        /// <param name="col">The col.</param>
        /// <param name="channels">The channels.</param>
        /// <param name="height">The height.</param>
        /// <param name="width">The width.</param>
        /// <param name="ksize_h">The ksize h.</param>
        /// <param name="ksize_w">The ksize w.</param>
        /// <param name="pad_h">The pad h.</param>
        /// <param name="pad_w">The pad w.</param>
        /// <param name="stride_h">The stride h.</param>
        /// <param name="stride_w">The stride w.</param>
        /// <param name="dilation_h">The dilation h.</param>
        /// <param name="dilation_w">The dilation w.</param>
        public void Im2Col(Tensor im, Tensor col, int channels,
                           int height, int width,
                           int ksize_h, int ksize_w, int pad_h,
                           int pad_w, int stride_h, int stride_w,
                           int dilation_h, int dilation_w)
        {
            var context     = CudaHelpers.TSContextForTensor(im);
            var cudaContext = context.CudaContextForTensor(im);

            // From Torch source:
            // We are going to launch channels * height_col * width_col kernels, each
            // kernel responsible for copying a single-channel grid.
            int height_col = (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1))
                             / stride_h + 1;
            int width_col = (width + 2 * pad_w - (dilation_w * (ksize_w - 1) + 1))
                            / stride_w + 1;
            int num_kernels = channels * height_col * width_col;

            var data_im  = CudaHelpers.GetBufferStart(im);
            var data_col = CudaHelpers.GetBufferStart(col);

            Invoke(context, cudaContext, "im2col_kernel", new dim3(NNThreads.NumBlocks(num_kernels)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                   num_kernels, data_im, height, width, ksize_h, ksize_w,
                   pad_h, pad_w, stride_h, stride_w,
                   dilation_h, dilation_w,
                   height_col, width_col, data_col);
        }
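height_col and width_col follow the standard convolution output-size formula. The hypothetical method below (illustration only, not part of the original source) evaluates it for an assumed 32-pixel dimension with a 3-wide kernel, padding 1, stride 1 and dilation 1:

        // Illustration only: the output-size formula used by Im2Col/Col2Im above,
        // evaluated for assumed values. (32 + 2*1 - (1*(3 - 1) + 1)) / 1 + 1 = 32,
        // i.e. a "same"-size output; with stride_h = 2 the same expression gives 16.
        private static int ConvOutputSizeExample()
        {
            int height = 32, ksize_h = 3, pad_h = 1, stride_h = 1, dilation_h = 1;

            return (height + 2 * pad_h - (dilation_h * (ksize_h - 1) + 1)) / stride_h + 1;
        }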
Example #29
        public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
        {
            var context     = CudaHelpers.TSContextForTensor(gradOutput);
            var cudaContext = context.CudaContextForTensor(gradOutput);

            var dimw = 3;
            var dimh = 2;
            var dimc = 1;

            var nbatch  = input.Sizes[0];
            var nslices = input.Sizes[dimc];
            var iheight = input.Sizes[dimh];
            var iwidth  = input.Sizes[dimw];
            var owidth  = gradOutput.Sizes[dimw];
            var oheight = gradOutput.Sizes[dimh];

            using var gradOutputContig = Ops.AsContiguous(gradOutput);
            var gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
            var indicesPtr    = CudaHelpers.GetBufferStart(indices);
            var gradInputPtr  = CudaHelpers.GetBufferStart(gradInput);

            var count = (int)input.ElementCount();

            this.Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                        count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth,
                        cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
        }
Example #30
        public void SpatialMaxPoolingForward(Tensor input, Tensor output, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
        {
            var context     = CudaHelpers.TSContextForTensor(input);
            var cudaContext = context.CudaContextForTensor(input);

            var iwidth      = input.Sizes[3];
            var iheight     = input.Sizes[2];
            var nInputPlane = input.Sizes[1];
            var batchSize   = input.Sizes[0];

            long owidth;
            long oheight;

            if (ceilMode)
            {
                // ReSharper disable once ArrangeRedundantParentheses
                oheight = (long)(Math.Ceiling((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
                // ReSharper disable once ArrangeRedundantParentheses
                owidth = (long)(Math.Ceiling((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
            }
            else
            {
                // ReSharper disable once ArrangeRedundantParentheses
                oheight = (long)(Math.Floor((float)(iheight - cd.kH + 2 * cd.padH) / cd.dH)) + 1;
                // ReSharper disable once ArrangeRedundantParentheses
                owidth = (long)(Math.Floor((float)(iwidth - cd.kW + 2 * cd.padW) / cd.dW)) + 1;
            }

            if (cd.padW != 0 || cd.padH != 0)
            {
                // ensure that the last pooling starts inside the image
                if ((oheight - 1) * cd.dH >= iheight + cd.padH)
                {
                    --oheight;
                }

                if ((owidth - 1) * cd.dW >= iwidth + cd.padW)
                {
                    --owidth;
                }
            }

            using var inputContig = Ops.AsContiguous(input);
            var inputPtr   = CudaHelpers.GetBufferStart(inputContig);
            var outputPtr  = CudaHelpers.GetBufferStart(output);
            var indicesPtr = CudaHelpers.GetBufferStart(indices);

            var count = (int)output.ElementCount();

            this.Invoke(context, cudaContext, "MaxPoolForward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                        count, inputPtr, batchSize, nInputPlane, iheight, iwidth, oheight, owidth,
                        cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, outputPtr, indicesPtr);
        }
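The only difference between the two branches above is Math.Ceiling versus Math.Floor, which matters only when the pooling window does not tile the input exactly. The hypothetical method below (illustration only, not part of the original source) evaluates both for an assumed 8-row input, a 3-row window, no padding and stride 2; when padding is nonzero, the (oheight - 1) * cd.dH >= iheight + cd.padH adjustment above can additionally drop a trailing window that would start outside the padded input.

        // Illustration only: for the assumed sizes, floor mode gives floor(2.5) + 1 = 3
        // output rows, while ceil mode gives ceil(2.5) + 1 = 4.
        private static (long FloorMode, long CeilMode) PoolOutputHeightExample()
        {
            long iheight = 8;
            int  kH = 3, padH = 0, dH = 2;

            long floorMode = (long)Math.Floor((float)(iheight - kH + 2 * padH) / dH) + 1;
            long ceilMode  = (long)Math.Ceiling((float)(iheight - kH + 2 * padH) / dH) + 1;

            return (floorMode, ceilMode);
        }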