/// <summary>
/// Single-precision matrix-vector product via cuBLAS SGEMV: writes <c>mat * vec</c> into
/// <paramref name="result"/> (alpha = 1, beta = 0, so prior contents of result are not accumulated).
/// </summary>
/// <param name="context">CUDA context used to obtain a pooled cuBLAS handle for <paramref name="mat"/>.</param>
/// <param name="result">Output vector; its element stride is taken from <c>result.Strides[0]</c>.</param>
/// <param name="mat">2-D matrix whose last dimension must have stride 1 (row-major).</param>
/// <param name="vec">Input vector; its element stride is taken from <c>vec.Strides[0]</c>.</param>
/// <exception cref="ArgumentException">Thrown when <paramref name="mat"/> is not contiguous in its last dimension.</exception>
/// <exception cref="CudaBlasException">Thrown when cuBLAS reports a non-success status.</exception>
private static void Run_M_V_float(TSCudaContext context, Tensor result, Tensor mat, Tensor vec)
{
    // Require lhs to be row-major. This means we must tell BLAS to transpose it
    // (BLAS expects column-major matrices).
    if (mat.Strides[1] != 1)
    {
        throw new ArgumentException("lhs must be contiguous in the last dimension");
    }

    using (Util.PooledObject<CudaBlas> blas = context.BlasForTensor(mat))
    {
        ManagedCuda.BasicTypes.CUdeviceptr yPtr = CudaHelpers.GetBufferStart(result);
        ManagedCuda.BasicTypes.CUdeviceptr aPtr = CudaHelpers.GetBufferStart(mat);
        ManagedCuda.BasicTypes.CUdeviceptr xPtr = CudaHelpers.GetBufferStart(vec);

        // A row-major (rows x cols) buffer read as column-major is the (cols x rows) transpose,
        // so m/n are swapped and the Transpose op recovers the original matrix.
        Operation trans = Operation.Transpose;
        int m = (int)mat.Sizes[1];
        int n = (int)mat.Sizes[0];
        int incx = (int)vec.Strides[0];
        int lda = (int)mat.Strides[0];
        int incy = (int)result.Strides[0];
        float alpha = 1;
        float beta = 0;

        // Check the returned status instead of discarding it, so a failed GEMV is not
        // silently treated as success (matches the error handling of the other BLAS wrappers here).
        CublasStatus _status = CudaBlasNativeMethods.cublasSgemv_v2(blas.Value.CublasHandle, trans, m, n, ref alpha, aPtr, lda, xPtr, incx, ref beta, yPtr, incy);
        if (_status != CublasStatus.Success)
        {
            throw new CudaBlasException(_status);
        }
    }
}
/// <summary>
/// Single-precision dot product of <paramref name="lhs"/> and <paramref name="rhs"/> via cuBLAS SDOT.
/// The scalar is received into a local float and then stored into <paramref name="result"/>'s
/// storage at <c>result.StorageOffset</c>.
/// </summary>
/// <param name="context">CUDA context used to obtain a pooled cuBLAS handle for <paramref name="lhs"/>.</param>
/// <param name="result">Tensor that receives the scalar dot product.</param>
/// <param name="lhs">First vector; supplies the element count (<c>Sizes[0]</c>) and its stride.</param>
/// <param name="rhs">Second vector; only its stride is read here (its length is not checked against lhs).</param>
/// <exception cref="CudaBlasException">Thrown when cuBLAS reports a non-success status.</exception>
private static void Run_Dot_float(TSCudaContext context, Tensor result, Tensor lhs, Tensor rhs)
{
    using (Util.PooledObject<CudaBlas> pooledBlas = context.BlasForTensor(lhs))
    {
        CUdeviceptr xDevice = CudaHelpers.GetBufferStart(lhs);
        CUdeviceptr yDevice = CudaHelpers.GetBufferStart(rhs);

        int count = (int)lhs.Sizes[0];
        int strideX = (int)lhs.Strides[0];
        int strideY = (int)rhs.Strides[0];

        float dot = 0;
        CublasStatus status = CudaBlasNativeMethods.cublasSdot_v2(
            pooledBlas.Value.CublasHandle,
            count,
            xDevice, strideX,
            yDevice, strideY,
            ref dot);

        if (status != CublasStatus.Success)
        {
            throw new CudaBlasException(status);
        }

        result.Storage.SetElementAsFloat(result.StorageOffset, dot);
    }
}
/// <summary>
/// Strided-batched GEMM via cuBLAS: for each batch entry, <c>c = alpha * op(a) * op(b) + beta * c</c>.
/// Dispatches to the single- or double-precision routine based on <c>c.ElementType</c>.
/// </summary>
/// <param name="context">CUDA context used to obtain a pooled cuBLAS handle for <paramref name="c"/>.</param>
/// <param name="transA">Whether <paramref name="a"/> is used transposed.</param>
/// <param name="transB">Whether <paramref name="b"/> is used transposed.</param>
/// <param name="alpha">Scale applied to the product (widened to double on the Float64 path).</param>
/// <param name="a">Left operand; indexed as 3-D with <c>Strides[1] == 1</c> required.</param>
/// <param name="b">Right operand; indexed as 3-D with <c>Strides[1] == 1</c> required.</param>
/// <param name="beta">Scale applied to the existing contents of <paramref name="c"/>.</param>
/// <param name="c">Output; indexed as 3-D with <c>Strides[1] == 1</c> required. <c>Sizes[0]</c> is the batch count.</param>
/// <exception cref="ArgumentException">Thrown when a, b or c fails the stride-1 requirement.</exception>
/// <exception cref="CudaBlasException">Thrown when cuBLAS reports a non-success status.</exception>
/// <exception cref="NotSupportedException">Thrown when the element type of c is neither Float32 nor Float64.</exception>
private static void GemmOpBatch(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
{
    // Layout preconditions: each operand must have unit stride along dimension 1.
    if (a.Strides[1] != 1)
    {
        throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
    }

    if (b.Strides[1] != 1)
    {
        throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
    }

    if (c.Strides[1] != 1)
    {
        throw new ArgumentException($"c must be contiguous in the first dimension (column major / fortran order) ({a.Strides[0]}, {a.Strides[1]}, {a.Strides[2]}) ({b.Strides[0]}, {b.Strides[1]}, {b.Strides[2]}) ({c.Strides[0]}, {c.Strides[1]}, {c.Strides[2]})");
    }

    using (Util.PooledObject<CudaBlas> pooledBlas = context.BlasForTensor(c))
    {
        bool aNotTransposed = transA == BlasOp.NonTranspose;
        bool bNotTransposed = transB == BlasOp.NonTranspose;
        Operation opA = GetCudaBlasOp(transA);
        Operation opB = GetCudaBlasOp(transB);

        // Problem dimensions; which axis is read depends on the transpose flags.
        int rows = (int)a.Sizes[aNotTransposed ? 1 : 2];
        int inner = (int)b.Sizes[bNotTransposed ? 1 : 2];
        int cols = (int)b.Sizes[bNotTransposed ? 2 : 1];

        // Leading dimensions come from stride 2, per-batch strides from stride 0.
        int leadA = (int)a.Strides[2];
        int leadB = (int)b.Strides[2];
        int leadC = (int)c.Strides[2];
        int batchStrideA = (int)a.Strides[0];
        int batchStrideB = (int)b.Strides[0];
        int batchStrideC = (int)c.Strides[0];
        int batches = (int)c.Sizes[0];

        if (c.ElementType == DType.Float32)
        {
            CUdeviceptr aDevice = CudaHelpers.GetBufferStart(a);
            CUdeviceptr bDevice = CudaHelpers.GetBufferStart(b);
            CUdeviceptr cDevice = CudaHelpers.GetBufferStart(c);

            CublasStatus singleStatus = CudaBlasNativeMethods.cublasSgemmStridedBatched(
                pooledBlas.Value.CublasHandle, opA, opB, rows, cols, inner,
                ref alpha, aDevice, leadA, batchStrideA,
                bDevice, leadB, batchStrideB,
                ref beta, cDevice, leadC, batchStrideC, batches);

            if (singleStatus != CublasStatus.Success)
            {
                throw new CudaBlasException(singleStatus);
            }

            return;
        }

        if (c.ElementType == DType.Float64)
        {
            CUdeviceptr aDevice = CudaHelpers.GetBufferStart(a);
            CUdeviceptr bDevice = CudaHelpers.GetBufferStart(b);
            CUdeviceptr cDevice = CudaHelpers.GetBufferStart(c);

            // The double-precision routine takes double scalars; widen the float arguments.
            double alpha64 = alpha;
            double beta64 = beta;

            CublasStatus doubleStatus = CudaBlasNativeMethods.cublasDgemmStridedBatched(
                pooledBlas.Value.CublasHandle, opA, opB, rows, cols, inner,
                ref alpha64, aDevice, leadA, batchStrideA,
                bDevice, leadB, batchStrideB,
                ref beta64, cDevice, leadC, batchStrideC, batches);

            if (doubleStatus != CublasStatus.Success)
            {
                throw new CudaBlasException(doubleStatus);
            }

            return;
        }

        throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
    }
}