示例#1
0
        /// <summary>
        /// Computes result = mat * vec for float tensors, where <paramref name="mat"/> is row-major.
        /// cuBLAS expects column-major matrices, so the matrix is passed with Operation.Transpose
        /// and its dimensions swapped (m = columns, n = rows of the row-major view).
        /// </summary>
        /// <param name="context">CUDA context used to obtain the cuBLAS handle for the tensor's device.</param>
        /// <param name="result">Destination vector tensor (y).</param>
        /// <param name="mat">Left-hand matrix tensor; must be contiguous in its last dimension.</param>
        /// <param name="vec">Right-hand vector tensor (x).</param>
        /// <exception cref="ArgumentException">Thrown when <paramref name="mat"/> is not contiguous in the last dimension.</exception>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        private static void Run_M_V_float(TSCudaContext context, Tensor result, Tensor mat, Tensor vec)
        {
            // Require lhs to be row-major. This means we must tell BLAS to transpose it (BLAS expects column-major matrices)
            if (mat.Strides[1] != 1)
            {
                throw new ArgumentException("lhs must be contiguous in the last dimension");
            }

            using (var blas = context.BlasForTensor(mat))
            {
                var yPtr = CudaHelpers.GetBufferStart(result);
                var aPtr = CudaHelpers.GetBufferStart(mat);
                var xPtr = CudaHelpers.GetBufferStart(vec);

                Operation trans = Operation.Transpose;
                int       m     = (int)mat.Sizes[1];     // rows of the column-major view = columns of the row-major matrix
                int       n     = (int)mat.Sizes[0];
                int       incx  = (int)vec.Strides[0];
                int       lda   = (int)mat.Strides[0];
                int       incy  = (int)result.Strides[0];
                float     alpha = 1;
                float     beta  = 0;

                // Check the returned status instead of discarding it (consistent with Run_Dot_float).
                CublasStatus status = CudaBlasNativeMethods.cublasSgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException(status);
                }
            }
        }
示例#2
0
        /// <summary>
        /// Computes transpose(this) * matrix. Both operands are stored column-major,
        /// so only the A operand is marked as transposed for cuBLAS.
        /// </summary>
        /// <param name="matrix">Right-hand operand; its row count must equal this matrix's row count.</param>
        /// <returns>A new (_columns x matrix.ColumnCount) matrix.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        public IMatrix TransposeThisAndMultiply(IMatrix matrix)
        {
            Debug.Assert(IsValid && matrix.IsValid);
            var other = (GpuMatrix)matrix;

            Debug.Assert(_rows == other._rows);

            var ret = _cuda.Allocate(_columns * other.ColumnCount);
            int rowsA = _rows, columnsA = _columns, columnsB = other.ColumnCount, rowsB = other.RowCount;

            float alpha = 1.0f, beta = 0.0f;

            // C(columnsA x columnsB) = A^T(columnsA x rowsA) * B(rowsB x columnsB); rowsA == rowsB asserted above.
            var status = CudaBlasNativeMethods.cublasSgemm_v2(_cuda.Blas.CublasHandle,
                                                 Operation.Transpose,
                                                 Operation.NonTranspose,
                                                 columnsA,
                                                 columnsB,
                                                 rowsB,
                                                 ref alpha,
                                                 _data.DevicePointer,
                                                 rowsA,
                                                 other._data.DevicePointer,
                                                 rowsB,
                                                 ref beta,
                                                 ret.DevicePointer,
                                                 columnsA
                                                 );
            // Surface cuBLAS failures instead of silently discarding the status.
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(new GpuMatrix(_cuda, _columns, other.ColumnCount, ret));
        }
示例#3
0
        /// <summary>
        /// Extracts row <paramref name="index"/> as a new vector. The matrix is stored
        /// column-major, so consecutive row elements are _rows floats apart (incx = _rows).
        /// </summary>
        /// <param name="index">Zero-based row index.</param>
        /// <returns>A new vector of _columns elements.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS copy fails.</exception>
        public IVector Row(int index)
        {
            Debug.Assert(IsValid);
            var ret    = _cuda.Allocate(_columns);
            int offset = index * sizeof(float);

            // Check the copy status instead of discarding it.
            var status = CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, _columns, _data.DevicePointer + offset, _rows, ret.DevicePointer, 1);
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(new GpuVector(_cuda, ret));
        }
示例#4
0
        /// <summary>
        /// Copies <paramref name="length"/> elements of row <paramref name="rowIndex"/>, starting
        /// at column <paramref name="columnIndex"/>, into a new vector. The column-major byte offset
        /// of element (r, c) is (r + c * _rows) * sizeof(float); stride between row elements is _rows.
        /// </summary>
        /// <param name="rowIndex">Zero-based row index.</param>
        /// <param name="columnIndex">Zero-based starting column.</param>
        /// <param name="length">Number of elements to copy.</param>
        /// <returns>A new vector of <paramref name="length"/> elements.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS copy fails.</exception>
        public IVector GetRowSegment(int rowIndex, int columnIndex, int length)
        {
            Debug.Assert(IsValid);
            int offset = (rowIndex + (columnIndex * _rows)) * sizeof(float);
            var ret    = _cuda.Allocate(length);

            // Check the copy status instead of discarding it.
            var status = CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, length, _data.DevicePointer + offset, _rows, ret.DevicePointer, 1);
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(new GpuVector(_cuda, ret));
        }
示例#5
0
        /// <summary>
        /// Extracts row <paramref name="index"/> as a new vector. The matrix is stored
        /// column-major, so consecutive row elements are RowCount floats apart (incx = RowCount).
        /// </summary>
        /// <param name="index">Zero-based row index.</param>
        /// <returns>A new vector of ColumnCount elements.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS copy fails.</exception>
        public IVector Row(int index)
        {
            Debug.Assert(IsValid);
            var ret    = _cuda.Allocate(ColumnCount);
            int offset = index * CudaProvider.FLOAT_SIZE;

            // Check the copy status instead of discarding it.
            var status = CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, ColumnCount,
                                                 Memory.DevicePointer + offset, RowCount, ret.DevicePointer, 1);
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(new GpuVector(_cuda, ret, true));
        }
示例#6
0
        /// <summary>
        /// Copies <paramref name="length"/> elements of row <paramref name="rowIndex"/>, starting
        /// at column <paramref name="columnIndex"/>, into a new vector. The column-major byte offset
        /// of element (r, c) is (r + c * RowCount) * FLOAT_SIZE; stride between row elements is RowCount.
        /// </summary>
        /// <param name="rowIndex">Zero-based row index.</param>
        /// <param name="columnIndex">Zero-based starting column.</param>
        /// <param name="length">Number of elements to copy.</param>
        /// <returns>A new vector of <paramref name="length"/> elements.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS copy fails.</exception>
        public IVector GetRowSegment(int rowIndex, int columnIndex, int length)
        {
            Debug.Assert(IsValid);
            int offset = (rowIndex + (columnIndex * RowCount)) * CudaProvider.FLOAT_SIZE;
            var ret    = _cuda.Allocate(length);

            // Check the copy status instead of discarding it.
            var status = CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, length,
                                                 Memory.DevicePointer + offset, RowCount, ret.DevicePointer, 1);
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(new GpuVector(_cuda, ret, true));
        }
示例#7
0
        /// <summary>
        /// Returns a new transposed copy of this matrix via cublasSgeam
        /// (C = alpha * A^T + beta * B with beta = 0, so B is never read).
        /// </summary>
        /// <returns>A new (ColumnCount x RowCount) matrix.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        public IMatrix Transpose()
        {
            Debug.Assert(IsValid);
            var   ret = _cuda.Allocate(RowCount * ColumnCount);
            float alpha = 1.0f, beta = 0.0f;

            // beta == 0 means the B operand is unused, so a null device pointer is passed for it.
            var status = CudaBlasNativeMethods.cublasSgeam(_cuda.Blas.CublasHandle, Operation.Transpose,
                                              Operation.NonTranspose, ColumnCount, RowCount, ref alpha, Memory.DevicePointer, RowCount,
                                              ref beta, new CUdeviceptr(0), ColumnCount, ret.DevicePointer, ColumnCount);
            // Surface cuBLAS failures instead of silently discarding the status.
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(new GpuMatrix(_cuda, ColumnCount, RowCount, ret, true));
        }
示例#8
0
        /// <summary>
        /// Computes, for each of the _depth slices of this 3D tensor, transpose(slice) * otherSlice,
        /// where otherSlice is the corresponding (rowsB x columnsB) slice of the 4D tensor, using a
        /// single strided-batched GEMM call. Slices are strided by _blockSize (this) and
        /// rowsB * columnsB (other) elements.
        /// </summary>
        /// <param name="tensor">4D tensor operand; its Count must equal this tensor's Depth.</param>
        /// <returns>A new (_columns x other.Depth x _depth) 3D tensor.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        public I3DTensor TransposeThisAndMultiply(I4DTensor tensor)
        {
            var other = (Gpu4DTensor)tensor;

#if DEBUG
            Debug.Assert(tensor.Count == Depth && IsValid && other.IsValid);
#endif
            var ptr = Memory.DevicePointer;
            var ptr2 = other.Memory.DevicePointer;
            int rowsA = _rows, columnsA = _columns, columnsB = other.Depth,
                rowsB = other.RowCount * other.ColumnCount, blockSize2 = columnsB * rowsB;
            float alpha = 1.0f, beta = 0.0f;
            var   output = new Gpu3DTensor(_cuda, _columns, columnsB, _depth,
                                           _cuda.Allocate(_columns * columnsB * _depth), true);
            // Per-batch strides: _blockSize (A), blockSize2 (B), _columns * columnsB (C).
            var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(_cuda.Blas.CublasHandle,
                                                                         Operation.Transpose, Operation.NonTranspose, columnsA, columnsB, rowsB, ref alpha, ptr,
                                                                         rowsA, _blockSize, ptr2, rowsB, blockSize2, ref beta, output.Memory.DevicePointer, columnsA,
                                                                         _columns * columnsB, _depth);
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(output);
        }
示例#9
0
        /// <summary>
        /// Computes this * transpose(matrix). Both operands are stored column-major,
        /// so only the B operand is marked as transposed for cuBLAS.
        /// </summary>
        /// <param name="matrix">Right-hand operand; its column count must equal this matrix's column count.</param>
        /// <returns>A new (RowCount x matrix.RowCount) matrix.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        public IMatrix TransposeAndMultiply(IMatrix matrix)
        {
            Debug.Assert(IsValid && matrix.IsValid);
            var other = (GpuMatrix)matrix;

            Debug.Assert(ColumnCount == other.ColumnCount);
            var   ret = _cuda.Allocate(RowCount * other.RowCount);
            int   rowsA = RowCount, columnsArowsB = ColumnCount, rowsB = other.RowCount;
            float alpha = 1.0f, beta = 0.0f;

            // C(rowsA x rowsB) = A(rowsA x columnsArowsB) * B^T(columnsArowsB x rowsB).
            var status = CudaBlasNativeMethods.cublasSgemm_v2(_cuda.Blas.CublasHandle, Operation.NonTranspose,
                                                 Operation.Transpose, rowsA, rowsB, columnsArowsB, ref alpha, Memory.DevicePointer, rowsA,
                                                 other.Memory.DevicePointer, rowsB, ref beta, ret.DevicePointer, rowsA);
            // Surface cuBLAS failures instead of silently discarding the status.
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(new GpuMatrix(_cuda, RowCount, other.RowCount, ret, true));
        }
示例#10
0
        /// <summary>
        /// Builds a new matrix whose rows are the selected rows of this matrix, in the order given.
        /// Each source row is a strided copy (incx = RowCount, column-major storage); each copy is
        /// written as row 'offset' of the destination, hence incy = rowIndices.Count.
        /// </summary>
        /// <param name="rowIndices">Zero-based row indices to extract.</param>
        /// <returns>A new (rowIndices.Count x ColumnCount) matrix.</returns>
        /// <exception cref="CudaBlasException">Thrown when a cuBLAS copy fails.</exception>
        public IMatrix GetNewMatrixFromRows(IReadOnlyList <int> rowIndices)
        {
            Debug.Assert(IsValid);
            int offset = 0;
            var ret    = _cuda.Allocate(ColumnCount * rowIndices.Count);

            foreach (var item in rowIndices)
            {
                // Check each copy's status instead of discarding it.
                var status = CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, n: ColumnCount,
                                                     x: Memory.DevicePointer + (item * CudaProvider.FLOAT_SIZE), incx: RowCount,
                                                     y: ret.DevicePointer + (offset * CudaProvider.FLOAT_SIZE), incy: rowIndices.Count);
                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException(status);
                }
                offset += 1;
            }

            return(new GpuMatrix(_cuda, rowIndices.Count, ColumnCount, ret, true));
        }
示例#11
0
        /// <summary>
        /// Multiplies every depth slice of this 3D tensor by the same matrix using one
        /// strided-batched GEMM: the A stride is _blockSize per slice while the B stride is 0,
        /// so all batches reuse the single B operand.
        /// </summary>
        /// <param name="matrix">Right-hand matrix applied to each slice.</param>
        /// <returns>A new (_rows x matrix.ColumnCount x _depth) 3D tensor.</returns>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        public I3DTensor Multiply(IMatrix matrix)
        {
            var   other = (GpuMatrix)matrix;
            var   ptr = Memory.DevicePointer;
            int   rowsA = _rows, columnsArowsB = _columns, columnsB = matrix.ColumnCount;
            float alpha = 1.0f, beta = 0.0f;
            var   output = new Gpu3DTensor(_cuda, _rows, columnsB, _depth,
                                           _cuda.Allocate(_rows * columnsB * _depth), true);
            // Per-batch strides: _blockSize (A), 0 (B is shared), _rows * columnsB (C).
            var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(_cuda.Blas.CublasHandle,
                                                                         Operation.NonTranspose, Operation.NonTranspose, rowsA, columnsB, columnsArowsB, ref alpha,
                                                                         ptr, rowsA, _blockSize, other.Memory.DevicePointer, columnsArowsB, 0, ref beta,
                                                                         output.Memory.DevicePointer, rowsA, _rows * columnsB, _depth);

            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }
            return(output);
        }
示例#12
0
        /// <summary>
        /// Builds a new matrix whose rows are the selected rows of this matrix, in the order given.
        /// Each source row is a strided copy (incx = _rows, column-major storage); each copy is
        /// written as row 'offset' of the destination, hence incy = rowIndices.Count.
        /// </summary>
        /// <param name="rowIndices">Zero-based row indices to extract.</param>
        /// <returns>A new (rowIndices.Count x _columns) matrix.</returns>
        /// <exception cref="CudaBlasException">Thrown when a cuBLAS copy fails.</exception>
        public IMatrix GetNewMatrixFromRows(IReadOnlyList <int> rowIndices)
        {
            Debug.Assert(IsValid);
            int offset = 0;
            var ret    = _cuda.Allocate(_columns * rowIndices.Count);

            foreach (var item in rowIndices)
            {
                // Check each copy's status instead of discarding it.
                var status = CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle,
                                                     n: _columns,
                                                     x: _data.DevicePointer + (item * sizeof(float)),
                                                     incx: _rows,
                                                     y: ret.DevicePointer + (offset * sizeof(float)),
                                                     incy: rowIndices.Count
                                                     );
                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException(status);
                }
                offset += 1;
            }
            return(new GpuMatrix(_cuda, rowIndices.Count, _columns, ret));
        }
示例#13
0
        /// <summary>
        /// Runs the dot float.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="result">The result.</param>
        /// <param name="lhs">The LHS.</param>
        /// <param name="rhs">The RHS.</param>
        /// <exception cref="CudaBlasException"></exception>
        /// <summary>
        /// Computes the single-precision dot product of <paramref name="lhs"/> and
        /// <paramref name="rhs"/> on the GPU and stores the scalar into
        /// <paramref name="result"/> at its storage offset.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="result">The result.</param>
        /// <param name="lhs">The LHS.</param>
        /// <param name="rhs">The RHS.</param>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        private static void Run_Dot_float(TSCudaContext context, NDArray result, NDArray lhs, NDArray rhs)
        {
            using var blas = context.BlasForTensor(lhs);

            var xPtr = CudaHelpers.GetBufferStart(lhs);
            var yPtr = CudaHelpers.GetBufferStart(rhs);

            var elementCount = (int)lhs.Shape[0];
            var strideX      = (int)lhs.Strides[0];
            var strideY      = (int)rhs.Strides[0];

            var dot    = 0f;
            var status = CudaBlasNativeMethods.cublasSdot_v2(blas.Value.CublasHandle, elementCount, xPtr, strideX, yPtr, strideY, ref dot);
            if (status != CublasStatus.Success)
            {
                throw new CudaBlasException(status);
            }

            result.Storage.SetElementAsFloat(result.StorageOffset, dot);
        }
示例#14
0
        public IMatrix Transpose()
        {
            Debug.Assert(IsValid);
            var   ret = _cuda.Allocate(_rows * _columns);
            float alpha = 1.0f, beta = 0.0f;

            CudaBlasNativeMethods.cublasSgeam(_cuda.Blas.CublasHandle,
                                              Operation.Transpose,
                                              Operation.NonTranspose,
                                              _columns,
                                              _rows,
                                              ref alpha,
                                              _data.DevicePointer,
                                              _rows,
                                              ref beta,
                                              new ManagedCuda.BasicTypes.CUdeviceptr(0),
                                              _columns,
                                              ret.DevicePointer,
                                              _columns
                                              );
            return(new GpuMatrix(_cuda, _columns, _rows, ret));
        }
示例#15
0
        /// <summary>
        /// Computes the double-precision dot product of <paramref name="lhs"/> and
        /// <paramref name="rhs"/> on the GPU and writes the scalar (rounded to float)
        /// into <paramref name="result"/> at its storage offset.
        /// </summary>
        /// <param name="context">The context.</param>
        /// <param name="result">The result.</param>
        /// <param name="lhs">The LHS.</param>
        /// <param name="rhs">The RHS.</param>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        private static void Run_Dot_double(TSCudaContext context, Tensor result, Tensor lhs, Tensor rhs)
        {
            using (var blas = context.BlasForTensor(lhs))
            {
                var xPtr = CudaHelpers.GetBufferStart(lhs);
                var yPtr = CudaHelpers.GetBufferStart(rhs);

                var elementCount = (int)lhs.Sizes[0];
                var strideX      = (int)lhs.Strides[0];
                var strideY      = (int)rhs.Strides[0];

                // TODO add SetElementAsDouble to prevent need to round to float here
                double dot    = 0;
                var    status = CudaBlasNativeMethods.cublasDdot_v2(blas.Value.CublasHandle, elementCount, xPtr, strideX, yPtr, strideY, ref dot);

                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException(status);
                }

                result.Storage.SetElementAsFloat(result.StorageOffset, (float)dot);
            }
        }
示例#16
0
        /// <summary>
        /// Dispatches C = alpha * op(a) * op(b) + beta * c to cuBLAS for Float32 or Float64
        /// tensors. All operands must be column-major (stride 1 in the first dimension).
        /// </summary>
        /// <param name="context">CUDA context used to obtain the cuBLAS handle for <paramref name="c"/>.</param>
        /// <param name="transA">Whether to transpose operand <paramref name="a"/>.</param>
        /// <param name="transB">Whether to transpose operand <paramref name="b"/>.</param>
        /// <param name="alpha">Scalar multiplier for op(a) * op(b).</param>
        /// <param name="a">Left operand tensor.</param>
        /// <param name="b">Right operand tensor.</param>
        /// <param name="beta">Scalar multiplier for the existing contents of <paramref name="c"/>.</param>
        /// <param name="c">Destination tensor; its element type selects the Sgemm/Dgemm path.</param>
        /// <exception cref="ArgumentException">Thrown when any operand is not column-major contiguous.</exception>
        /// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
        /// <exception cref="NotSupportedException">Thrown for element types other than Float32/Float64.</exception>
        private static void GemmOp(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[0] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }
            if (b.Strides[0] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }
            if (c.Strides[0] != 1)
            {
                throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
            }

            using (var blas = context.BlasForTensor(c))
            {
                bool      nta    = transA == BlasOp.NonTranspose;
                bool      ntb    = transB == BlasOp.NonTranspose;
                Operation transa = GetCudaBlasOp(transA);
                Operation transb = GetCudaBlasOp(transB);
                // GEMM dimensions read from the *untransposed* storage: when an operand is
                // transposed its logical rows/cols come from the other size index.
                int       m      = (int)a.Sizes[nta ? 0 : 1];
                int       k      = (int)b.Sizes[ntb ? 0 : 1];
                int       n      = (int)b.Sizes[ntb ? 1 : 0];
                // Leading dimensions are the second-dimension strides (column-major layout).
                int       lda    = (int)a.Strides[1];
                int       ldb    = (int)b.Strides[1];
                int       ldc    = (int)c.Strides[1];



                if (c.ElementType == DType.Float32)
                {
                    var aPtrSingle = CudaHelpers.GetBufferStart(a);
                    var bPtrSingle = CudaHelpers.GetBufferStart(b);
                    var cPtrSingle = CudaHelpers.GetBufferStart(c);

                    var _statusF32 = CudaBlasNativeMethods.cublasSgemm_v2(blas.Value.CublasHandle,
                                                                          transa, transb, m, n, k, ref alpha, aPtrSingle, lda, bPtrSingle, ldb, ref beta, cPtrSingle, ldc);
                    if (_statusF32 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF32);
                    }
                }
                else if (c.ElementType == DType.Float64)
                {
                    var aPtrDouble  = CudaHelpers.GetBufferStart(a);
                    var bPtrDouble  = CudaHelpers.GetBufferStart(b);
                    var cPtrDouble  = CudaHelpers.GetBufferStart(c);
                    // Widen the float scalars for the double-precision GEMM entry point.
                    var alphaDouble = (double)alpha;
                    var betaDouble  = (double)beta;
                    var _statusF64  = CudaBlasNativeMethods.cublasDgemm_v2(blas.Value.CublasHandle,
                                                                           transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, bPtrDouble, ldb, ref betaDouble, cPtrDouble, ldc);
                    if (_statusF64 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF64);
                    }
                }
                else
                {
                    throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }
示例#17
0
        /// <summary>
        /// Dispatches a strided-batched GEMM (per batch: C = alpha * op(a) * op(b) + beta * c)
        /// to cuBLAS for Float32 or Float64 3D tensors. Dimension 0 is the batch dimension;
        /// dimensions 1 and 2 must be column-major (stride 1 in dimension 1).
        /// </summary>
        /// <param name="context">CUDA context used to obtain the cuBLAS handle for <paramref name="c"/>.</param>
        /// <param name="transA">Whether to transpose operand <paramref name="a"/> in each batch.</param>
        /// <param name="transB">Whether to transpose operand <paramref name="b"/> in each batch.</param>
        /// <param name="alpha">Scalar multiplier for op(a) * op(b).</param>
        /// <param name="a">Left operand tensor (batch x rows x cols).</param>
        /// <param name="b">Right operand tensor (batch x rows x cols).</param>
        /// <param name="beta">Scalar multiplier for the existing contents of <paramref name="c"/>.</param>
        /// <param name="c">Destination tensor; its element type selects the Sgemm/Dgemm path.</param>
        /// <exception cref="ArgumentException">Thrown when any operand is not column-major contiguous per batch.</exception>
        /// <exception cref="CudaBlasException">Thrown when a cuBLAS call fails.</exception>
        /// <exception cref="NotSupportedException">Thrown for element types other than Float32/Float64.</exception>
        private static void GemmOpBatch(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[1] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }

            if (b.Strides[1] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }

            if (c.Strides[1] != 1)
            {
                throw new ArgumentException($"c must be contiguous in the first dimension (column major / fortran order) ({a.Strides[0]}, {a.Strides[1]}, {a.Strides[2]}) ({b.Strides[0]}, {b.Strides[1]}, {b.Strides[2]}) ({c.Strides[0]}, {c.Strides[1]}, {c.Strides[2]})");
            }

            using (Util.PooledObject <CudaBlas> blas = context.BlasForTensor(c))
            {
                bool      nta    = transA == BlasOp.NonTranspose;
                bool      ntb    = transB == BlasOp.NonTranspose;
                Operation transa = GetCudaBlasOp(transA);
                Operation transb = GetCudaBlasOp(transB);
                // Per-batch GEMM dimensions: indices 1/2 of the 3D shape, swapped when transposed.
                int       m      = (int)a.Sizes[nta ? 1 : 2];
                int       k      = (int)b.Sizes[ntb ? 1 : 2];
                int       n      = (int)b.Sizes[ntb ? 2 : 1];
                // Leading dimensions come from the third-dimension strides (column-major per batch).
                int       lda    = (int)a.Strides[2];
                int       ldb    = (int)b.Strides[2];
                int       ldc    = (int)c.Strides[2];

                // Batch-to-batch element strides and the batch count (dimension 0).
                int stra      = (int)a.Strides[0];
                int strb      = (int)b.Strides[0];
                int strc      = (int)c.Strides[0];
                int batchSize = (int)c.Sizes[0];


                //// Set the math mode to allow cuBLAS to use Tensor Cores:
                //cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

                // NOTE(review): the math mode is set on the (pooled) handle but never restored
                // afterwards — verify later users of this handle expect TensorOpMath.
                CublasStatus status = CudaBlasNativeMethods.cublasSetMathMode(blas.Value.CublasHandle, ManagedCuda.CudaBlas.Math.TensorOpMath);
                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException($"Failed to set math mode to tensor ops.");
                }


                if (c.ElementType == DType.Float32)
                {
                    CUdeviceptr aPtrSingle = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr bPtrSingle = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr cPtrSingle = CudaHelpers.GetBufferStart(c);

                    CublasStatus _statusF32 = CudaBlasNativeMethods.cublasSgemmStridedBatched(blas.Value.CublasHandle,
                                                                                              transa, transb, m, n, k, ref alpha, aPtrSingle, lda, stra, bPtrSingle, ldb, strb, ref beta, cPtrSingle, ldc, strc, batchSize);
                    if (_statusF32 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF32);
                    }
                }
                else if (c.ElementType == DType.Float64)
                {
                    CUdeviceptr  aPtrDouble  = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr  bPtrDouble  = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr  cPtrDouble  = CudaHelpers.GetBufferStart(c);
                    // Widen the float scalars for the double-precision entry point.
                    double       alphaDouble = alpha;
                    double       betaDouble  = beta;
                    CublasStatus _statusF64  = CudaBlasNativeMethods.cublasDgemmStridedBatched(blas.Value.CublasHandle,
                                                                                               transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, stra, bPtrDouble, ldb, strb, ref betaDouble, cPtrDouble, ldc, strc, batchSize);
                    if (_statusF64 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF64);
                    }
                }
                else
                {
                    throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }