Example no. 1
        private void LayerNorm(TSCudaContext context, Tensor result, Tensor src, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            int  ndim        = src.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(src.Sizes, src.Strides);
            long cols        = src.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);
            CUdeviceptr srcPtr    = CudaHelpers.GetBufferStart(src);
            CUdeviceptr alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gLNormalization", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, srcPtr, alphaPtr, betaPtr, rows, cols, eps);
        }
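        // The rows/cols derivation above recurs throughout these examples. A minimal plain-.NET sketch
        // of the idea (the helper name is hypothetical; it assumes a contiguous tensor, and the
        // storageSize % cols check above rejects layouts where this division would not be exact):
        private static long RowsFromStorage(long[] sizes)
        {
            // For a contiguous tensor the storage size is the product of all dimensions, so dividing
            // by the last dimension (cols) leaves the product of the leading dimensions (rows).
            long storageSize = 1;
            foreach (long size in sizes)
            {
                storageSize *= size;
            }
            long cols = sizes[sizes.Length - 1];
            return storageSize / cols;
        }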
Example no. 2
        /// <summary>
        /// Runs a double-precision matrix-vector multiply (GEMV): result = mat * vec.
        /// </summary>
        /// <param name="context">The CUDA context.</param>
        /// <param name="result">The result vector.</param>
        /// <param name="mat">The matrix (lhs).</param>
        /// <param name="vec">The vector (rhs).</param>
        /// <exception cref="ArgumentException">lhs must be contiguous in the last dimension</exception>
        private static void Run_M_V_double(TSCudaContext context, NDArray result, NDArray mat, NDArray vec)
        {
            // Require lhs to be row-major. This means we must tell BLAS to transpose it (BLAS expects column-major matrices)
            if (mat.Strides[1] != 1)
            {
                throw new ArgumentException("lhs must be contiguous in the last dimension");
            }

            using (var blas = context.BlasForTensor(mat))
            {
                var yPtr = CudaHelpers.GetBufferStart(result);
                var aPtr = CudaHelpers.GetBufferStart(mat);
                var xPtr = CudaHelpers.GetBufferStart(vec);

                Operation trans = Operation.Transpose;
                int       m     = (int)mat.Shape[1];
                int       n     = (int)mat.Shape[0];
                int       incx  = (int)vec.Strides[0];
                int       lda   = (int)mat.Strides[0];
                int       incy  = (int)result.Strides[0];
                double    alpha = 1;
                double    beta  = 0;

                CudaBlasNativeMethods.cublasDgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
            }
        }
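        // For reference, a CPU sketch of what the cuBLAS call above computes: result = mat * vec for a
        // row-major mat of shape [n, m]. Handing column-major BLAS the same buffer as an m-by-n matrix
        // and requesting Transpose yields the identical product. Names and the flat-array layout here
        // are illustrative only, not part of the library.
        private static double[] MatVecRowMajorReference(double[] mat, int n, int m, double[] vec)
        {
            var result = new double[n];              // mat has n rows and m columns; vec has length m
            for (int row = 0; row < n; row++)
            {
                double sum = 0;
                for (int col = 0; col < m; col++)
                {
                    sum += mat[row * m + col] * vec[col];
                }
                result[row] = sum;
            }
            return result;
        }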
Example no. 3
        private void UpdateCost(TSCudaContext context, Tensor weight, Tensor ids, Tensor costs)
        {
            CudaContext cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            int  ndim = weight.DimensionCount;
            long rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                rows *= weight.Sizes[dim];
            }

            long cols = weight.Sizes[ndim - 1];

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr weightPtr = CudaHelpers.GetBufferStart(weight);
            CUdeviceptr idsPtr    = CudaHelpers.GetBufferStart(ids);
            CUdeviceptr costsPtr  = CudaHelpers.GetBufferStart(costs);

            Invoke(context, cudaContext, "UpdateCost", grid, threads, 0, CUstream.NullStream, weightPtr, idsPtr, costsPtr, rows, cols);
        }
Example no. 4
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            CudaContext cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();
            int  ndim        = inGrad.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(inGrad.Sizes, inGrad.Strides);
            long cols        = inGrad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            CUdeviceptr out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            CUdeviceptr alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            CUdeviceptr betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            CUdeviceptr inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            CUdeviceptr yPtr         = CudaHelpers.GetBufferStart(y);
            CUdeviceptr x1Ptr        = CudaHelpers.GetBufferStart(x1);
            CUdeviceptr x2Ptr        = CudaHelpers.GetBufferStart(x2);
            CUdeviceptr alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            CUdeviceptr betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, threads.x * sizeof(float) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example no. 5
        private void ReduceIndexOuterDim(TSCudaContext context, Tensor resultValues, Tensor resultIndices, Tensor src, int dimension, Tuple <float, float> init, string baseKernelName)
        {
            CudaContext cudaContext = context.CudaContextForTensor(src);

            int  ndim      = src.DimensionCount;
            long num_orows = 1;

            for (int dim = 0; dim < dimension; dim++)
            {
                num_orows *= src.Sizes[dim];
            }
            long row_size  = src.Sizes[dimension];
            long num_irows = 1;

            for (int dim = dimension + 1; dim < ndim; dim++)
            {
                num_irows *= src.Sizes[dim];
            }

            dim3 threads    = new dim3((uint)Math.Min(512, num_irows));
            int  maxGridDim = 1024;
            dim3 grid       = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

            CUdeviceptr resultValPtr = CudaHelpers.GetBufferStart(resultValues);
            CUdeviceptr resultIdxPtr = CudaHelpers.GetBufferStart(resultIndices);
            CUdeviceptr srcPtr       = CudaHelpers.GetBufferStart(src);

            string kernelName = "outer_index_" + baseKernelName;

            Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultValPtr, resultIdxPtr, srcPtr, num_orows, num_irows, row_size, init.Item1, init.Item2);
        }
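        // The outer/inner split above is easiest to see on a concrete shape. A hypothetical helper with
        // the same loops: for sizes = [4, 5, 6] and dimension = 1 it returns
        // numORows = 4, rowSize = 5, numIRows = 6.
        private static (long numORows, long rowSize, long numIRows) SplitAroundDimension(long[] sizes, int dimension)
        {
            long numORows = 1;                                       // product of dimensions before 'dimension'
            for (int dim = 0; dim < dimension; dim++)
            {
                numORows *= sizes[dim];
            }
            long numIRows = 1;                                       // product of dimensions after 'dimension'
            for (int dim = dimension + 1; dim < sizes.Length; dim++)
            {
                numIRows *= sizes[dim];
            }
            return (numORows, sizes[dimension], numIRows);
        }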
Example no. 6
        private void BuildTriMask(TSCudaContext context, Tensor result, float value, float maskedValue)
        {
            CudaContext cudaContext = context.CudaContextForTensor(result);

            cudaContext.SetCurrent();

            int  ndim        = result.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(result.Sizes, result.Strides);
            long cols        = result.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;


            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr resultPtr = CudaHelpers.GetBufferStart(result);


            Invoke(context, cudaContext, "BuildTriMask", grid, threads, threads.x * sizeof(float), CUstream.NullStream, resultPtr, rows, cols, value, maskedValue);
        }
Example no. 7
        private void Adam(TSCudaContext context, Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var rows = weight.Sizes[0];
            var cols = weight.Sizes[1];

            var  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var vPtr        = CudaHelpers.GetBufferStart(v);
            var mPtr        = CudaHelpers.GetBufferStart(m);

            Invoke(context, cudaContext, "Adam", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, vPtr, mPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);
        }
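        // For orientation, a textbook Adam update for a single element in plain C#. This shows the
        // standard algorithm only; it is an assumption that the "Adam" kernel above follows it, and the
        // extra batchSize, clipval and regc arguments suggest it also scales, clips and regularizes the
        // gradient, which is not modelled here.
        private static void AdamStepReference(ref float w, float g, ref float v, ref float m,
                                              float stepSize, float decayRateV, float decayRateM,
                                              int iter, float eps)
        {
            m = decayRateM * m + (1f - decayRateM) * g;                  // first-moment estimate
            v = decayRateV * v + (1f - decayRateV) * g * g;              // second-moment estimate
            float mHat = m / (1f - (float)Math.Pow(decayRateM, iter));   // bias correction
            float vHat = v / (1f - (float)Math.Pow(decayRateV, iter));
            w -= stepSize * mHat / ((float)Math.Sqrt(vHat) + eps);
        }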
Example no. 8
        //__global__ void SGD(float* w, float* g, float* c, float* l, unsigned rows, unsigned cols, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        private void SGD(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, Tensor lrw, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            var cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            var rows = weight.Sizes[0];
            var cols = weight.Sizes[1];

            var  ndim     = weight.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= weight.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var weightPtr   = CudaHelpers.GetBufferStart(weight);
            var gradientPtr = CudaHelpers.GetBufferStart(gradient);
            var cachePtr    = CudaHelpers.GetBufferStart(cache);
            var lrwPtr      = CudaHelpers.GetBufferStart(lrw);

            Invoke(context, cudaContext, "SGD", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, weightPtr, gradientPtr, cachePtr, lrwPtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
Example no. 9
        private void AddLayerNormGrad(TSCudaContext context, Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(inGrad);

            cudaContext.SetCurrent();

            var rows = inGrad.Sizes[0];
            var cols = inGrad.Sizes[1];

            var  ndim     = inGrad.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= inGrad.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var out1GradPtr  = CudaHelpers.GetBufferStart(out1Grad);
            var out2GradPtr  = CudaHelpers.GetBufferStart(out2Grad);
            var alphaGradPtr = CudaHelpers.GetBufferStart(alphaGrad);
            var betaGradPtr  = CudaHelpers.GetBufferStart(betaGrad);
            var inGradPtr    = CudaHelpers.GetBufferStart(inGrad);
            var yPtr         = CudaHelpers.GetBufferStart(y);
            var x1Ptr        = CudaHelpers.GetBufferStart(x1);
            var x2Ptr        = CudaHelpers.GetBufferStart(x2);
            var alphaPtr     = CudaHelpers.GetBufferStart(alpha);
            var betaPtr      = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLayerNormalizationGrad", grid, threads, (uint)(threads.x * sizeof(float)) * 4, CUstream.NullStream, out1GradPtr, out2GradPtr, alphaGradPtr, betaGradPtr, inGradPtr, yPtr, x1Ptr, x2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
Example no. 10
        private void AddLayerNorm(TSCudaContext context, Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            var cudaContext = context.CudaContextForTensor(src1);

            cudaContext.SetCurrent();

            var rows = src1.Sizes[0];
            var cols = src1.Sizes[1];

            var  ndim     = src1.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src1.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var resultPtr = CudaHelpers.GetBufferStart(result);
            var src1Ptr   = CudaHelpers.GetBufferStart(src1);
            var src2Ptr   = CudaHelpers.GetBufferStart(src2);
            var alphaPtr  = CudaHelpers.GetBufferStart(alpha);
            var betaPtr   = CudaHelpers.GetBufferStart(beta);


            Invoke(context, cudaContext, "gAddLNormalization", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, src1Ptr, src2Ptr, alphaPtr, betaPtr, rows, cols, eps);
        }
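        // For reference, a textbook layer normalization of one row in plain C#. The standard formula is
        // shown; it is an assumption that gAddLNormalization applies it per row to the element-wise sum
        // src1 + src2, with alpha and beta as the learned gain and bias.
        private static void LayerNormRowReference(float[] x, float[] alpha, float[] beta, float[] output, float eps)
        {
            int n = x.Length;
            float mean = 0f;
            for (int i = 0; i < n; i++) { mean += x[i]; }
            mean /= n;
            float variance = 0f;
            for (int i = 0; i < n; i++) { float d = x[i] - mean; variance += d * d; }
            variance /= n;
            float invStd = 1f / (float)Math.Sqrt(variance + eps);
            for (int i = 0; i < n; i++)
            {
                output[i] = alpha[i] * (x[i] - mean) * invStd + beta[i];
            }
        }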
Example no. 11
        public void SpatialMaxPoolingBackward(Tensor input, Tensor gradOutput, Tensor gradInput, Tensor indices, ConvolutionDesc2d cd, bool ceilMode)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(gradOutput);
            CudaContext   cudaContext = context.CudaContextForTensor(gradOutput);

            int dimw = 3;
            int dimh = 2;
            int dimc = 1;

            long nbatch  = input.Sizes[0];
            long nslices = input.Sizes[dimc];
            long iheight = input.Sizes[dimh];
            long iwidth  = input.Sizes[dimw];
            long owidth  = gradOutput.Sizes[dimw];
            long oheight = gradOutput.Sizes[dimh];


            using (Tensor gradOutputContig = Ops.AsContiguous(gradOutput))
            {
                CUdeviceptr gradOutputPtr = CudaHelpers.GetBufferStart(gradOutputContig);
                CUdeviceptr indicesPtr    = CudaHelpers.GetBufferStart(indices);
                CUdeviceptr gradInputPtr  = CudaHelpers.GetBufferStart(gradInput);

                int count = (int)input.ElementCount();

                Invoke(context, cudaContext, "MaxPoolBackward", new dim3(NNThreads.NumBlocks(count)), new dim3(NNThreads.NumThreads), 0, CUstream.NullStream,
                       count, gradOutputPtr, indicesPtr, nbatch, nslices, iheight, iwidth, oheight, owidth,
                       cd.kH, cd.kW, cd.dH, cd.dW, cd.padH, cd.padW, gradInputPtr);
            }
        }
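        // The pooling-backward launch above is a one-thread-per-element configuration. The usual block
        // count for that style of launch is ceil(count / threadsPerBlock); whether NNThreads.NumBlocks is
        // implemented exactly like this is an assumption, so the helper below is illustrative only.
        private static int BlocksForElementCount(int count, int threadsPerBlock)
        {
            return (count + threadsPerBlock - 1) / threadsPerBlock;  // round up to cover all elements
        }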
Example no. 12
        public static Tensor Invoke(CudaCode kernels, string funcName, Tensor result, Tensor src)
        {
            try
            {
                TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
                CudaContext   cudaContext = context.CudaContextForTensor(src);

                cudaContext.SetCurrent();

                Tensor writeTarget  = TensorResultBuilder.GetWriteTarget(result, src, false, src.Sizes);
                long   elementCount = writeTarget.ElementCount();

                byte[] ptx = kernels.GetPtx(context.Compiler);

                if (result == src)
                {
                    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t1_" + funcName, writeTarget, elementCount);
                }
                else
                {
                    ApplyOpInvoke.Invoke(context, cudaContext, ptx, "t2_" + funcName, writeTarget, src, elementCount);
                }

                return(writeTarget);
            }
            catch (Exception e)
            {
                Logger.WriteLine($"Error = '{e.Message}', Call stack = '{e.StackTrace}'");
                throw;
            }
        }
Example no. 13
        public static void Invoke(TSCudaContext context, CudaContext cudaContext, byte[] ptx, string baseName, params object[] args)
        {
            ThrowIfAnyTensorInvalid(args);

            try
            {
                cudaContext.SetCurrent();

                CudaDeviceProperties deviceInfo = context.DeviceInfoForContext(cudaContext);

                IEnumerable <Tensor> allTensors = args.OfType <Tensor>();
                Tensor firstTensor       = allTensors.First();
                long   elementCount      = firstTensor.ElementCount();
                ApplySpecialization spec = new ApplySpecialization(allTensors.ToArray());

                ConvertTensorArgs.Convert(cudaContext, spec.Use32BitIndices, args);

                ManagedCuda.VectorTypes.dim3 block = ApplyUtils.GetApplyBlock();
                ManagedCuda.VectorTypes.dim3 grid  = ApplyUtils.GetApplyGrid(deviceInfo, elementCount);

                string     fullKernelName = PermutationGenerator.GetMangledName(baseName, spec);
                CudaKernel kernel         = context.KernelCache.Get(cudaContext, ptx, fullKernelName);

                kernel.GridDimensions  = grid;
                kernel.BlockDimensions = block;
                kernel.RunAsync(CUstream.NullStream, args);
            }
            catch (Exception ex)
            {
                Logger.WriteLine($"Exception message = {ex.Message}, Call stack = {ex.StackTrace}");
                throw;
            }
        }
Example no. 14
        private void VarOuterDim(TSCudaContext context, Tensor result, Tensor src, int dimension, bool normByN, bool applySqrt)
        {
            var cudaContext = context.CudaContextForTensor(src);

            var  ndim      = src.DimensionCount;
            long num_orows = 1;

            for (var dim = 0; dim < dimension; dim++)
            {
                num_orows *= src.Sizes[dim];
            }
            var row_size = src.Sizes[dimension];
            // Treat all inner dimensions (i.e. dim > dimension) as one.
            long num_irows = 1;

            for (var dim = dimension + 1; dim < ndim; dim++)
            {
                num_irows *= src.Sizes[dim];
            }

            var threads    = new dim3((uint)Math.Min(512, num_irows));
            var maxGridDim = 1024;
            var grid       = new dim3((uint)Math.Min(maxGridDim, num_orows), (uint)Math.Min(maxGridDim, ApplyUtils.CeilDiv(num_irows, threads.x)));

            var resultPtr  = CudaHelpers.GetBufferStart(result);
            var srcPtr     = CudaHelpers.GetBufferStart(src);
            var kernelName = "kernel_varOuterDim" + GetMangledNameSuffix(normByN, applySqrt);

            this.Invoke(context, cudaContext, kernelName, grid, threads, 0, CUstream.NullStream, resultPtr, srcPtr, num_orows, num_irows, row_size);
        }
Example no. 15
        private void RMSProp(TSCudaContext context, Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            CudaContext cudaContext = context.CudaContextForTensor(weight);

            cudaContext.SetCurrent();

            int  ndim        = weight.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(weight.Sizes, weight.Strides);
            long cols        = weight.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr weightPtr   = CudaHelpers.GetBufferStart(weight);
            CUdeviceptr gradientPtr = CudaHelpers.GetBufferStart(gradient);
            CUdeviceptr cachePtr    = CudaHelpers.GetBufferStart(cache);

            Invoke(context, cudaContext, "RMSProp", grid, threads, 0, CUstream.NullStream, weightPtr, gradientPtr, cachePtr, rows, cols, batchSize, step_size, clipval, regc, decay_rate, eps);
        }
Example no. 16
        private void Softmax(TSCudaContext context, Tensor result, Tensor src)
        {
            var cudaContext = context.CudaContextForTensor(src);

            cudaContext.SetCurrent();

            var rows = src.Sizes[0];
            var cols = src.Sizes[1];

            var  ndim     = src.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= src.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var resultPtr = CudaHelpers.GetBufferStart(result);
            var srcPtr    = CudaHelpers.GetBufferStart(src);

            Invoke(context, cudaContext, "gSoftmax", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, resultPtr, srcPtr, rows, cols);
        }
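        // For reference, a textbook numerically stable softmax of one row in plain C#. It is an
        // assumption that the gSoftmax kernel above produces the same per-row result.
        private static void SoftmaxRowReference(float[] row)
        {
            float max = float.NegativeInfinity;
            for (int i = 0; i < row.Length; i++) { if (row[i] > max) { max = row[i]; } }
            float sum = 0f;
            for (int i = 0; i < row.Length; i++)
            {
                row[i] = (float)Math.Exp(row[i] - max);              // subtract the max for stability
                sum += row[i];
            }
            for (int i = 0; i < row.Length; i++) { row[i] /= sum; }
        }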
Example no. 17
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            CudaContext cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            int  ndim        = grad.DimensionCount;
            long storageSize = TensorDimensionHelpers.GetStorageSize(grad.Sizes, grad.Strides);
            long cols        = grad.Sizes[ndim - 1];

            if (storageSize % cols != 0)
            {
                throw new Exception($"Invalid tensor storage size = '{storageSize}', and cols = '{cols}'");
            }

            long rows = storageSize / cols;

            int iAddGrad = addGrad ? 1 : 0;

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));

            CUdeviceptr gradPtr = CudaHelpers.GetBufferStart(grad);
            CUdeviceptr adjPtr  = CudaHelpers.GetBufferStart(adj);
            CUdeviceptr valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, threads.x * sizeof(float), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols, iAddGrad);
        }
Example no. 18
        private void SoftmaxGrad(TSCudaContext context, Tensor grad, Tensor adj, Tensor val)
        {
            var cudaContext = context.CudaContextForTensor(grad);

            cudaContext.SetCurrent();

            var rows = grad.Sizes[0];
            var cols = grad.Sizes[1];

            var  ndim     = grad.DimensionCount;
            long num_rows = 1;

            for (var dim = 0; dim < ndim - 1; dim++)
            {
                num_rows *= grad.Sizes[dim];
            }

            var threads = new dim3((uint)Math.Min(512, num_rows));
            var grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(num_rows, threads.y)));

            var gradPtr = CudaHelpers.GetBufferStart(grad);
            var adjPtr  = CudaHelpers.GetBufferStart(adj);
            var valPtr  = CudaHelpers.GetBufferStart(val);

            Invoke(context, cudaContext, "gSoftmaxGrad", grid, threads, (uint)(threads.x * sizeof(float)), CUstream.NullStream, gradPtr, adjPtr, valPtr, rows, cols);
        }
Example no. 19
        //BuildSrcTgtMask(float* weights, int* originalSrcLengths, int* originalTgtLengths, int batchSize, unsigned rows, unsigned cols)

        private void BuildSrcTgtMask(TSCudaContext context, Tensor mask, Tensor originalSrcLengths, Tensor originalTgtLengths, int batchSize)
        {
            CudaContext cudaContext = context.CudaContextForTensor(mask);

            cudaContext.SetCurrent();

            int  ndim = mask.DimensionCount;
            long rows = 1;

            for (int dim = 0; dim < ndim - 1; dim++)
            {
                rows *= mask.Sizes[dim];
            }

            long cols = mask.Sizes[ndim - 1];

            dim3 threads = new dim3((uint)Math.Min(512, rows));
            dim3 grid    = new dim3((uint)Math.Min(1024, ApplyUtils.CeilDiv(rows, threads.y)));


            CUdeviceptr maskPtr = CudaHelpers.GetBufferStart(mask);
            CUdeviceptr originalSrcLengthsPtr = CudaHelpers.GetBufferStart(originalSrcLengths);
            CUdeviceptr originalTgtLengthsPtr = CudaHelpers.GetBufferStart(originalTgtLengths);


            Invoke(context, cudaContext, "BuildSrcTgtMask", grid, threads, 0, CUstream.NullStream, maskPtr, originalSrcLengthsPtr, originalTgtLengthsPtr, batchSize, rows, cols);
        }
Example no. 20
        public static Tensor Mul_M_M(TSCudaContext context, Tensor result, Tensor lhs, Tensor rhs)
        {
            if (lhs.ElementType != rhs.ElementType || (result != null && result.ElementType != lhs.ElementType))
            {
                throw new InvalidOperationException("All tensors must have the same element type");
            }
            CudaHelpers.ThrowIfDifferentDevices(result, lhs, rhs);
            if (result != null && !(result.Storage is CudaStorage))
            {
                throw new ArgumentException("result must be a CUDA tensor", "result");
            }
            if (!(lhs.Storage is CudaStorage))
            {
                throw new ArgumentException("lhs must be a CUDA tensor", "lhs");
            }
            if (!(rhs.Storage is CudaStorage))
            {
                throw new ArgumentException("rhs must be a CUDA tensor", "rhs");
            }


            var writeTarget = TensorResultBuilder.GetWriteTarget(result, lhs, false, lhs.Sizes[0], rhs.Sizes[1]);

            Gemm(context, 1, lhs, rhs, 0, writeTarget);

            return(writeTarget);
        }
Example no. 21
        /// <summary>
        /// Invokes the copy kernel, copying <paramref name="src"/> into <paramref name="result"/>.
        /// </summary>
        /// <param name="kernels">The fill/copy kernel set.</param>
        /// <param name="context">The context.</param>
        /// <param name="cudaContext">The CUDA context.</param>
        /// <param name="result">The result (destination) tensor.</param>
        /// <param name="src">The source tensor.</param>
        public static void Invoke(FillCopyKernels kernels, TSCudaContext context, CudaContext cudaContext, Tensor result, Tensor src)
        {
            var ptx          = kernels.GetPtx(context.Compiler);
            var elementCount = result.ElementCount();

            ApplyOpInvoke.Invoke(context, cudaContext, ptx, "copy", result, src, elementCount);
        }
Example no. 22
        public void CopyGpuToCpu(Tensor result, Tensor src, long totalElements)
        {
            TSCudaContext context    = CudaHelpers.TSContextForTensor(src);
            CudaContext   srcContext = context.CudaContextForTensor(src);

            using (Tensor srcContig = Ops.AsContiguous(src))
                using (Tensor resultContig = AsTypeCpu(result, src.ElementType, true))
                {
                    IntPtr      resultContigPtr = ((Cpu.CpuStorage)resultContig.Storage).PtrAtElement(resultContig.StorageOffset);
                    CUdeviceptr srcContigPtr    = ((CudaStorage)srcContig.Storage).DevicePtrAtElement(srcContig.StorageOffset);

                    long totalBytes = totalElements * srcContig.ElementType.Size();

                    // Use DriverAPINativeMethods directly here instead of CudaContext.CopyToHost, because CopyToHost only has an overload
                    // for specifying totalBytes as a uint, but we may exceed the range of a uint here.
                    CUResult res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(resultContigPtr, srcContigPtr, totalBytes);
                    if (res != CUResult.Success)
                    {
                        throw new CudaException(res);
                    }

                    if (result.Storage != resultContig.Storage)
                    {
                        Ops.Copy(result, resultContig); // copy on CPU
                    }
                }
        }
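        // The comment above concerns overflow of a uint byte count. A tiny worked example of the
        // arithmetic, with illustrative numbers only:
        private static bool ExceedsUIntByteCount()
        {
            long totalElements = 1_500_000_000;                      // e.g. 1.5 billion float elements
            long totalBytes = totalElements * sizeof(float);         // 6,000,000,000 bytes
            return totalBytes > uint.MaxValue;                       // true: uint tops out at 4,294,967,295
        }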
Example no. 23
        public Tensor Scatter(Tensor result, Tensor src, int dim, Tensor indices)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            CudaContext   cudaContext = context.CudaContextForTensor(src);

            if (result == null)
            {
                throw new ArgumentNullException("result");
            }

            if (result.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("result and src must have same number of dimensions");
            }

            if (dim < 0 || dim >= result.DimensionCount)
            {
                throw new ArgumentOutOfRangeException("dim");
            }

            if (indices.DimensionCount != src.DimensionCount)
            {
                throw new InvalidOperationException("src and indices must have same number of dimensions");
            }

            if (!src.IsSameSizeAs(indices))
            {
                throw new InvalidOperationException("src and indices must be the same size");
            }

            if (!TensorResultBuilder.ArrayEqualExcept(src.Sizes, result.Sizes, dim))
            {
                throw new InvalidOperationException("result and src must be the same size except in dimension dim");
            }

            Tensor writeTarget = result;

            long nElement = indices.ElementCount();
            dim3 block    = ApplyUtils.GetApplyBlock();
            dim3 grid     = ApplyUtils.GetApplyGrid(context.DeviceInfoForContext(cudaContext), nElement);

            if (ApplyUtils.CanUse32BitIndexMath(writeTarget) &&
                ApplyUtils.CanUse32BitIndexMath(src) &&
                ApplyUtils.CanUse32BitIndexMath(indices))
            {
                int    dims       = indices.DimensionCount <= 3 ? indices.DimensionCount : -1;
                string kernelName = MakeKernelName(ScatterBaseName, true, dims);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, true,
                       writeTarget, src, indices, dim, (int)nElement);
            }
            else
            {
                string kernelName = MakeKernelName(ScatterBaseName, false, -1);
                Invoke(context, cudaContext, kernelName, grid, block, 0, CUstream.NullStream, false,
                       writeTarget, src, indices, dim, nElement);
            }

            return(writeTarget);
        }
Example no. 24
        public Tensor SoftmaxGrad(Tensor grad, Tensor adj, Tensor val, bool addGrad = true)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(grad);

            SoftmaxGrad(context, grad, adj, val, addGrad);

            return(grad);
        }
Example no. 25
        public Tensor RMSProp(Tensor weight, Tensor gradient, Tensor cache, int batchSize, float step_size, float clipval, float regc, float decay_rate, float eps)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(weight);

            RMSProp(context, weight, gradient, cache, batchSize, step_size, clipval, regc, decay_rate, eps);

            return(weight);
        }
Example no. 26
        public Tensor Adam(Tensor weight, Tensor gradient, Tensor v, Tensor m, int batchSize, float step_size, float clipval, float regc, float decay_rate_v, float decay_rate_m, int iter, float eps)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(weight);

            Adam(context, weight, gradient, v, m, batchSize, step_size, clipval, regc, decay_rate_v, decay_rate_m, iter, eps);

            return(weight);
        }
Example no. 27
        public void AddLayerNormGrad(Tensor out1Grad, Tensor out2Grad, Tensor alphaGrad, Tensor betaGrad, Tensor inGrad, Tensor y, Tensor x1, Tensor x2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            TSCudaContext context      = CudaHelpers.TSContextForTensor(inGrad);
            Tensor        writeTarget1 = TensorResultBuilder.GetWriteTarget(out1Grad, inGrad, false, inGrad.Sizes);
            Tensor        writeTarget2 = TensorResultBuilder.GetWriteTarget(out2Grad, inGrad, false, inGrad.Sizes);

            AddLayerNormGrad(context, writeTarget1, writeTarget2, alphaGrad, betaGrad, inGrad, y, x1, x2, alpha, beta, eps);
        }
Example no. 28
        public Tensor BuildTriMask(Tensor result, float value, float maskedValue)
        {
            TSCudaContext context = CudaHelpers.TSContextForTensor(result);

            BuildTriMask(context, result, value, maskedValue);

            return(result);
        }
Example no. 29
        public Tensor Softmax(Tensor result, Tensor src)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(result, src, true, src.Sizes);

            Softmax(context, writeTarget, src);

            return(writeTarget);
        }
Example no. 30
        public Tensor AddLayerNorm(Tensor result, Tensor src1, Tensor src2, Tensor alpha, Tensor beta, float eps = 1e-9f)
        {
            TSCudaContext context     = CudaHelpers.TSContextForTensor(src1);
            Tensor        writeTarget = TensorResultBuilder.GetWriteTarget(result, src1, false, src1.Sizes);

            AddLayerNorm(context, writeTarget, src1, src2, alpha, beta, eps);

            return(writeTarget);
        }