private static void Run_M_V_float(TSCudaContext context, Tensor result, Tensor mat, Tensor vec)
{
    // Require lhs to be row-major. This means we must tell BLAS to transpose it
    // (BLAS expects column-major matrices).
    if (mat.Strides[1] != 1)
    {
        throw new ArgumentException("lhs must be contiguous in the last dimension");
    }

    using (var blas = context.BlasForTensor(mat))
    {
        var yPtr = CudaHelpers.GetBufferStart(result);
        var aPtr = CudaHelpers.GetBufferStart(mat);
        var xPtr = CudaHelpers.GetBufferStart(vec);

        Operation trans = Operation.Transpose;
        int m = (int)mat.Sizes[1];
        int n = (int)mat.Sizes[0];
        int incx = (int)vec.Strides[0];
        int lda = (int)mat.Strides[0];
        int incy = (int)result.Strides[0];
        float alpha = 1;
        float beta = 0;

        CudaBlasNativeMethods.cublasSgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
    }
}
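The transpose trick above can be sanity-checked on the host: a row-major rows-by-cols buffer is, byte for byte, a column-major cols-by-rows matrix, i.e. the transpose, so asking BLAS for op(A) = A^T on that column-major view reproduces the row-major matrix-vector product. A minimal C# reference to make the identity concrete (illustrative only; RowMajorMatVec is not part of the library):

// CPU reference: y = A * x for a row-major (rows x cols) buffer. BLAS sees the
// same buffer as a column-major (cols x rows) matrix, i.e. A transposed, which
// is why Run_M_V_float passes Operation.Transpose with m = cols and n = rows.
static float[] RowMajorMatVec(float[] a, int rows, int cols, float[] x)
{
    var y = new float[rows];
    for (int i = 0; i < rows; i++)
    {
        for (int j = 0; j < cols; j++)
        {
            y[i] += a[i * cols + j] * x[j];
        }
    }
    return y;
}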
public IMatrix TransposeThisAndMultiply(IMatrix matrix)
{
    Debug.Assert(IsValid && matrix.IsValid);
    var other = (GpuMatrix)matrix;
    Debug.Assert(_rows == other._rows);

    var ret = _cuda.Allocate(_columns * other.ColumnCount);
    int rowsA = _rows, columnsA = _columns, columnsB = other.ColumnCount, rowsB = other.RowCount;
    float alpha = 1.0f, beta = 0.0f;

    // C = A^T * B: m = columnsA, n = columnsB, k = rowsB (== rowsA, asserted above)
    CudaBlasNativeMethods.cublasSgemm_v2(_cuda.Blas.CublasHandle,
        Operation.Transpose,
        Operation.NonTranspose,
        columnsA,
        columnsB,
        rowsB,
        ref alpha,
        _data.DevicePointer,
        rowsA,
        other._data.DevicePointer,
        rowsB,
        ref beta,
        ret.DevicePointer,
        columnsA
    );
    return new GpuMatrix(_cuda, _columns, other.ColumnCount, ret);
}
public IVector Row(int index)
{
    Debug.Assert(IsValid);
    var ret = _cuda.Allocate(_columns);
    // Column-major storage: consecutive elements of a row are spaced _rows apart.
    int offset = index * sizeof(float);
    CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, _columns, _data.DevicePointer + offset, _rows, ret.DevicePointer, 1);
    return new GpuVector(_cuda, ret);
}
public IVector GetRowSegment(int rowIndex, int columnIndex, int length)
{
    Debug.Assert(IsValid);
    // Element (rowIndex, columnIndex) of a column-major matrix sits at rowIndex + columnIndex * _rows.
    int offset = (rowIndex + (columnIndex * _rows)) * sizeof(float);
    var ret = _cuda.Allocate(length);
    CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, length, _data.DevicePointer + offset, _rows, ret.DevicePointer, 1);
    return new GpuVector(_cuda, ret);
}
public IVector Row(int index)
{
    Debug.Assert(IsValid);
    var ret = _cuda.Allocate(ColumnCount);
    int offset = index * CudaProvider.FLOAT_SIZE;
    CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, ColumnCount, Memory.DevicePointer + offset, RowCount, ret.DevicePointer, 1);
    return new GpuVector(_cuda, ret, true);
}
public IVector GetRowSegment(int rowIndex, int columnIndex, int length)
{
    Debug.Assert(IsValid);
    int offset = (rowIndex + (columnIndex * RowCount)) * CudaProvider.FLOAT_SIZE;
    var ret = _cuda.Allocate(length);
    CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle, length, Memory.DevicePointer + offset, RowCount, ret.DevicePointer, 1);
    return new GpuVector(_cuda, ret, true);
}
public IMatrix Transpose()
{
    Debug.Assert(IsValid);
    var ret = _cuda.Allocate(RowCount * ColumnCount);
    float alpha = 1.0f, beta = 0.0f;
    CudaBlasNativeMethods.cublasSgeam(_cuda.Blas.CublasHandle,
        Operation.Transpose,
        Operation.NonTranspose,
        ColumnCount,
        RowCount,
        ref alpha,
        Memory.DevicePointer,
        RowCount,
        ref beta,
        new CUdeviceptr(0),
        ColumnCount,
        ret.DevicePointer,
        ColumnCount);
    return new GpuMatrix(_cuda, ColumnCount, RowCount, ret, true);
}
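A note on the sgeam-based Transpose here (and the variant further below): with beta set to zero, cuBLAS should never need to read the B operand, which is presumably why a null device pointer (new CUdeviceptr(0)) is passed for it; the call then reduces to an out-of-place transpose, C = alpha * A^T. The destination has to be a separate allocation, since cuBLAS only documents in-place geam for the non-transposed operand.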
public I3DTensor TransposeThisAndMultiply(I4DTensor tensor)
{
    var other = (Gpu4DTensor)tensor;
#if DEBUG
    Debug.Assert(tensor.Count == Depth && IsValid && other.IsValid);
#endif
    var ptr = Memory.DevicePointer;
    var ptr2 = other.Memory.DevicePointer;
    int rowsA = _rows, columnsA = _columns, columnsB = other.Depth, rowsB = other.RowCount * other.ColumnCount, blockSize2 = columnsB * rowsB;
    float alpha = 1.0f, beta = 0.0f;
    var output = new Gpu3DTensor(_cuda, _columns, columnsB, _depth, _cuda.Allocate(_columns * columnsB * _depth), true);

    // One strided-batched GEMM replaces a loop of per-slice A^T * B multiplies.
    var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(_cuda.Blas.CublasHandle,
        Operation.Transpose,
        Operation.NonTranspose,
        columnsA,
        columnsB,
        rowsB,
        ref alpha,
        ptr,
        rowsA,
        _blockSize,
        ptr2,
        rowsB,
        blockSize2,
        ref beta,
        output.Memory.DevicePointer,
        columnsA,
        _columns * columnsB,
        _depth);
    if (status != CublasStatus.Success) {
        throw new CudaBlasException(status);
    }
    return output;

    // Note: the same result can be obtained with cublasSgemmBatched and per-slice
    // device-pointer lists (PtrToDeviceMemoryList), at the cost of building the
    // pointer arrays; the strided-batched call above avoids that overhead.
}
public IMatrix TransposeAndMultiply(IMatrix matrix)
{
    Debug.Assert(IsValid && matrix.IsValid);
    var other = (GpuMatrix)matrix;
    Debug.Assert(ColumnCount == other.ColumnCount);

    var ret = _cuda.Allocate(RowCount * other.RowCount);
    // C = A * B^T: the shared inner dimension is ColumnCount (== other.ColumnCount,
    // i.e. the row count of op(B) = B^T), hence the name columnsArowsB.
    int rowsA = RowCount, columnsArowsB = ColumnCount, rowsB = other.RowCount;
    float alpha = 1.0f, beta = 0.0f;

    CudaBlasNativeMethods.cublasSgemm_v2(_cuda.Blas.CublasHandle,
        Operation.NonTranspose,
        Operation.Transpose,
        rowsA,
        rowsB,
        columnsArowsB,
        ref alpha,
        Memory.DevicePointer,
        rowsA,
        other.Memory.DevicePointer,
        rowsB,
        ref beta,
        ret.DevicePointer,
        rowsA);
    return new GpuMatrix(_cuda, RowCount, other.RowCount, ret, true);
}
public IMatrix GetNewMatrixFromRows(IReadOnlyList<int> rowIndices)
{
    Debug.Assert(IsValid);
    int offset = 0;
    var ret = _cuda.Allocate(ColumnCount * rowIndices.Count);
    foreach (var item in rowIndices)
    {
        // Read source row `item` (stride RowCount) into destination row `offset`
        // (stride rowIndices.Count) of the new column-major matrix.
        CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle,
            n: ColumnCount,
            x: Memory.DevicePointer + (item * CudaProvider.FLOAT_SIZE),
            incx: RowCount,
            y: ret.DevicePointer + (offset * CudaProvider.FLOAT_SIZE),
            incy: rowIndices.Count);
        offset += 1;
    }
    return new GpuMatrix(_cuda, rowIndices.Count, ColumnCount, ret, true);
}
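The stride arithmetic is easier to see on the host: in column-major storage element (r, c) lives at index r + c * rows, so reading with increment RowCount walks one source row, and writing with increment rowIndices.Count lays each gathered row into the new matrix. A CPU sketch of the same gather (illustrative only; GatherRows is not a library method):

// Gather selected rows of a column-major (rows x cols) matrix into a new
// column-major (rowIndices.Length x cols) matrix, mirroring the cublasScopy loop.
static float[] GatherRows(float[] data, int rows, int cols, int[] rowIndices)
{
    int newRows = rowIndices.Length;
    var ret = new float[newRows * cols];
    for (int k = 0; k < newRows; k++)
    {
        for (int c = 0; c < cols; c++)
        {
            ret[k + c * newRows] = data[rowIndices[k] + c * rows];
        }
    }
    return ret;
}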
public I3DTensor Multiply(IMatrix matrix)
{
    var other = (GpuMatrix)matrix;
    var ptr = Memory.DevicePointer;
    int rowsA = _rows, columnsArowsB = _columns, columnsB = matrix.ColumnCount;
    float alpha = 1.0f, beta = 0.0f;
    var output = new Gpu3DTensor(_cuda, _rows, columnsB, _depth, _cuda.Allocate(_rows * columnsB * _depth), true);

    // A batch stride of 0 for B broadcasts the same matrix across every slice.
    var status = CudaBlasNativeMethods.cublasSgemmStridedBatched(_cuda.Blas.CublasHandle,
        Operation.NonTranspose,
        Operation.NonTranspose,
        rowsA,
        columnsB,
        columnsArowsB,
        ref alpha,
        ptr,
        rowsA,
        _blockSize,
        other.Memory.DevicePointer,
        columnsArowsB,
        0,
        ref beta,
        output.Memory.DevicePointer,
        rowsA,
        _rows * columnsB,
        _depth);
    if (status != CublasStatus.Success) {
        throw new CudaBlasException(status);
    }
    return output;

    // Note: the same result can be obtained with cublasSgemmBatched and per-slice
    // device-pointer lists (PtrToDeviceMemoryList), at the cost of building the
    // pointer arrays; the strided-batched call above avoids that overhead.
}
public IMatrix GetNewMatrixFromRows(IReadOnlyList<int> rowIndices)
{
    Debug.Assert(IsValid);
    int offset = 0;
    var ret = _cuda.Allocate(_columns * rowIndices.Count);
    foreach (var item in rowIndices)
    {
        CudaBlasNativeMethods.cublasScopy_v2(_cuda.Blas.CublasHandle,
            n: _columns,
            x: _data.DevicePointer + (item * sizeof(float)),
            incx: _rows,
            y: ret.DevicePointer + (offset * sizeof(float)),
            incy: rowIndices.Count);
        offset += 1;
    }
    return new GpuMatrix(_cuda, rowIndices.Count, _columns, ret);
}
/// <summary>
/// Computes the dot product of two float vectors.
/// </summary>
/// <param name="context">The context.</param>
/// <param name="result">The result.</param>
/// <param name="lhs">The LHS.</param>
/// <param name="rhs">The RHS.</param>
/// <exception cref="CudaBlasException">Thrown when the cuBLAS call fails.</exception>
private static void Run_Dot_float(TSCudaContext context, NDArray result, NDArray lhs, NDArray rhs)
{
    using (var blas = context.BlasForTensor(lhs))
    {
        var lhsPtr = CudaHelpers.GetBufferStart(lhs);
        var rhsPtr = CudaHelpers.GetBufferStart(rhs);
        int n = (int)lhs.Shape[0];
        int incx = (int)lhs.Strides[0];
        int incy = (int)rhs.Strides[0];

        float resultVal = 0;
        var _status = CudaBlasNativeMethods.cublasSdot_v2(blas.Value.CublasHandle, n, lhsPtr, incx, rhsPtr, incy, ref resultVal);
        if (_status != CublasStatus.Success) {
            throw new CudaBlasException(_status);
        }

        result.Storage.SetElementAsFloat(result.StorageOffset, resultVal);
    }
}
public IMatrix Transpose()
{
    Debug.Assert(IsValid);
    var ret = _cuda.Allocate(_rows * _columns);
    float alpha = 1.0f, beta = 0.0f;
    CudaBlasNativeMethods.cublasSgeam(_cuda.Blas.CublasHandle,
        Operation.Transpose,
        Operation.NonTranspose,
        _columns,
        _rows,
        ref alpha,
        _data.DevicePointer,
        _rows,
        ref beta,
        new ManagedCuda.BasicTypes.CUdeviceptr(0),
        _columns,
        ret.DevicePointer,
        _columns);
    return new GpuMatrix(_cuda, _columns, _rows, ret);
}
private static void Run_Dot_double(TSCudaContext context, Tensor result, Tensor lhs, Tensor rhs)
{
    using var blas = context.BlasForTensor(lhs);
    var lhsPtr = CudaHelpers.GetBufferStart(lhs);
    var rhsPtr = CudaHelpers.GetBufferStart(rhs);
    var n = (int)lhs.Sizes[0];
    var incx = (int)lhs.Strides[0];
    var incy = (int)rhs.Strides[0];

    // TODO: add SetElementAsDouble so the result does not have to be rounded to float here.
    double resultVal = 0;
    var _status = CudaBlasNativeMethods.cublasDdot_v2(blas.Value.CublasHandle, n, lhsPtr, incx, rhsPtr, incy, ref resultVal);
    if (_status != CublasStatus.Success) {
        throw new CudaBlasException(_status);
    }

    result.Storage.SetElementAsFloat(result.StorageOffset, (float)resultVal);
}
private static void GemmOp(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
{
    if (a.Strides[0] != 1) {
        throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
    }
    if (b.Strides[0] != 1) {
        throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
    }
    if (c.Strides[0] != 1) {
        throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
    }

    using (var blas = context.BlasForTensor(c))
    {
        bool nta = transA == BlasOp.NonTranspose;
        bool ntb = transB == BlasOp.NonTranspose;
        Operation transa = GetCudaBlasOp(transA);
        Operation transb = GetCudaBlasOp(transB);

        // m = rows of op(a), n = columns of op(b), k = shared inner dimension;
        // the leading dimensions are the column strides of the column-major tensors.
        int m = (int)a.Sizes[nta ? 0 : 1];
        int k = (int)b.Sizes[ntb ? 0 : 1];
        int n = (int)b.Sizes[ntb ? 1 : 0];
        int lda = (int)a.Strides[1];
        int ldb = (int)b.Strides[1];
        int ldc = (int)c.Strides[1];

        if (c.ElementType == DType.Float32)
        {
            var aPtrSingle = CudaHelpers.GetBufferStart(a);
            var bPtrSingle = CudaHelpers.GetBufferStart(b);
            var cPtrSingle = CudaHelpers.GetBufferStart(c);

            var _statusF32 = CudaBlasNativeMethods.cublasSgemm_v2(blas.Value.CublasHandle, transa, transb, m, n, k, ref alpha, aPtrSingle, lda, bPtrSingle, ldb, ref beta, cPtrSingle, ldc);
            if (_statusF32 != CublasStatus.Success) {
                throw new CudaBlasException(_statusF32);
            }
        }
        else if (c.ElementType == DType.Float64)
        {
            var aPtrDouble = CudaHelpers.GetBufferStart(a);
            var bPtrDouble = CudaHelpers.GetBufferStart(b);
            var cPtrDouble = CudaHelpers.GetBufferStart(c);
            var alphaDouble = (double)alpha;
            var betaDouble = (double)beta;

            var _statusF64 = CudaBlasNativeMethods.cublasDgemm_v2(blas.Value.CublasHandle, transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, bPtrDouble, ldb, ref betaDouble, cPtrDouble, ldc);
            if (_statusF64 != CublasStatus.Success) {
                throw new CudaBlasException(_statusF64);
            }
        }
        else
        {
            throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
        }
    }
}
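For readers less familiar with the BLAS conventions used above: m is the row count of op(a), n the column count of op(b), k the shared inner dimension, and each leading dimension is the distance in elements between consecutive columns. A plain C# reference for the NonTranspose/NonTranspose case (a sketch for illustration, not library code):

// Column-major reference for C = alpha*A*B + beta*C with no transposes,
// where lda/ldb/ldc are at least the corresponding row counts.
static void GemmRef(int m, int n, int k, float alpha, float[] a, int lda,
                    float[] b, int ldb, float beta, float[] c, int ldc)
{
    for (int j = 0; j < n; j++)
    {
        for (int i = 0; i < m; i++)
        {
            float acc = 0;
            for (int p = 0; p < k; p++)
            {
                acc += a[i + p * lda] * b[p + j * ldb];
            }
            c[i + j * ldc] = alpha * acc + beta * c[i + j * ldc];
        }
    }
}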
private static void GemmOpBatch(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
{
    // Dimension 0 is the batch dimension, so each matrix in the batch must be
    // contiguous in dimension 1 (its first matrix dimension, column major).
    if (a.Strides[1] != 1) {
        throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
    }
    if (b.Strides[1] != 1) {
        throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
    }
    if (c.Strides[1] != 1) {
        throw new ArgumentException($"c must be contiguous in the first dimension (column major / fortran order) ({a.Strides[0]}, {a.Strides[1]}, {a.Strides[2]}) ({b.Strides[0]}, {b.Strides[1]}, {b.Strides[2]}) ({c.Strides[0]}, {c.Strides[1]}, {c.Strides[2]})");
    }

    using (Util.PooledObject<CudaBlas> blas = context.BlasForTensor(c))
    {
        bool nta = transA == BlasOp.NonTranspose;
        bool ntb = transB == BlasOp.NonTranspose;
        Operation transa = GetCudaBlasOp(transA);
        Operation transb = GetCudaBlasOp(transB);

        int m = (int)a.Sizes[nta ? 1 : 2];
        int k = (int)b.Sizes[ntb ? 1 : 2];
        int n = (int)b.Sizes[ntb ? 2 : 1];
        int lda = (int)a.Strides[2];
        int ldb = (int)b.Strides[2];
        int ldc = (int)c.Strides[2];

        // Element strides between consecutive matrices in each batch.
        int stra = (int)a.Strides[0];
        int strb = (int)b.Strides[0];
        int strc = (int)c.Strides[0];
        int batchSize = (int)c.Sizes[0];

        // Set the math mode to allow cuBLAS to use Tensor Cores.
        CublasStatus status = CudaBlasNativeMethods.cublasSetMathMode(blas.Value.CublasHandle, ManagedCuda.CudaBlas.Math.TensorOpMath);
        if (status != CublasStatus.Success) {
            throw new CudaBlasException("Failed to set math mode to tensor ops.");
        }

        if (c.ElementType == DType.Float32)
        {
            CUdeviceptr aPtrSingle = CudaHelpers.GetBufferStart(a);
            CUdeviceptr bPtrSingle = CudaHelpers.GetBufferStart(b);
            CUdeviceptr cPtrSingle = CudaHelpers.GetBufferStart(c);

            CublasStatus _statusF32 = CudaBlasNativeMethods.cublasSgemmStridedBatched(blas.Value.CublasHandle, transa, transb, m, n, k, ref alpha, aPtrSingle, lda, stra, bPtrSingle, ldb, strb, ref beta, cPtrSingle, ldc, strc, batchSize);
            if (_statusF32 != CublasStatus.Success) {
                throw new CudaBlasException(_statusF32);
            }
        }
        else if (c.ElementType == DType.Float64)
        {
            CUdeviceptr aPtrDouble = CudaHelpers.GetBufferStart(a);
            CUdeviceptr bPtrDouble = CudaHelpers.GetBufferStart(b);
            CUdeviceptr cPtrDouble = CudaHelpers.GetBufferStart(c);
            double alphaDouble = alpha;
            double betaDouble = beta;

            CublasStatus _statusF64 = CudaBlasNativeMethods.cublasDgemmStridedBatched(blas.Value.CublasHandle, transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, stra, bPtrDouble, ldb, strb, ref betaDouble, cPtrDouble, ldc, strc, batchSize);
            if (_statusF64 != CublasStatus.Success) {
                throw new CudaBlasException(_statusF64);
            }
        }
        else
        {
            throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
        }
    }
}
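The three extra stride arguments are what distinguish the strided-batched call: each is the element offset between consecutive matrices of the batch, taken here from Strides[0]. For densely packed, non-transposed operands they reduce to the per-matrix sizes, and a stride of zero broadcasts one matrix across the whole batch (the pattern Gpu3DTensor.Multiply above uses for B). A small helper illustrating the expected values (hypothetical, not part of the library):

// Expected batch strides for densely packed column-major operands of a
// strided-batched GEMM with no transposes: per item, a is (m x k), b is (k x n),
// c is (m x n). A stride of 0 for b reuses one matrix for every batch item.
static (int stra, int strb, int strc) PackedBatchStrides(int m, int n, int k, bool broadcastB)
{
    return (m * k, broadcastB ? 0 : k * n, m * n);
}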