Beispiel #1
0
        /// <summary>
        /// Runs a strided-batched cuBLAS GEMM on three rank-3 tensors. Per the cuBLAS
        /// contract this computes, for every batch index i,
        /// <c>c[i] = alpha * op(a[i]) * op(b[i]) + beta * c[i]</c>.
        /// </summary>
        /// <param name="context">CUDA context used to obtain a pooled cuBLAS handle for <paramref name="c"/>.</param>
        /// <param name="transA">Operation applied to each matrix of <paramref name="a"/>.</param>
        /// <param name="transB">Operation applied to each matrix of <paramref name="b"/>.</param>
        /// <param name="alpha">Scale applied to the matrix product (widened to double for Float64 tensors).</param>
        /// <param name="a">Left operand; dimension 0 is the batch, and <c>Strides[1]</c> must be 1.</param>
        /// <param name="b">Right operand; dimension 0 is the batch, and <c>Strides[1]</c> must be 1.</param>
        /// <param name="beta">Scale applied to the existing contents of <paramref name="c"/> (widened to double for Float64 tensors).</param>
        /// <param name="c">Output tensor; its element type selects the single or double precision kernel.</param>
        /// <exception cref="ArgumentException">An operand is not unit-stride in dimension 1.</exception>
        /// <exception cref="OverflowException">A size or leading dimension does not fit in the 32-bit ints cuBLAS expects.</exception>
        /// <exception cref="CudaBlasException">Setting the math mode or the GEMM call itself reported a failure.</exception>
        /// <exception cref="NotSupportedException">The element type of <paramref name="c"/> is neither Float32 nor Float64.</exception>
        private static void GemmOpBatch(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[1] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }

            if (b.Strides[1] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }

            if (c.Strides[1] != 1)
            {
                throw new ArgumentException($"c must be contiguous in the first dimension (column major / fortran order) ({a.Strides[0]}, {a.Strides[1]}, {a.Strides[2]}) ({b.Strides[0]}, {b.Strides[1]}, {b.Strides[2]}) ({c.Strides[0]}, {c.Strides[1]}, {c.Strides[2]})");
            }

            using (Util.PooledObject<CudaBlas> blas = context.BlasForTensor(c))
            {
                bool nta = transA == BlasOp.NonTranspose;
                bool ntb = transB == BlasOp.NonTranspose;
                Operation transa = GetCudaBlasOp(transA);
                Operation transb = GetCudaBlasOp(transB);

                // Sizes and strides are 64-bit, but cuBLAS takes 32-bit dimensions and
                // leading dimensions. Convert in a checked context so an oversized tensor
                // raises OverflowException instead of silently wrapping to a bogus value.
                int m = checked((int)a.Sizes[nta ? 1 : 2]);
                int k = checked((int)b.Sizes[ntb ? 1 : 2]);
                int n = checked((int)b.Sizes[ntb ? 2 : 1]);
                int lda = checked((int)a.Strides[2]);
                int ldb = checked((int)b.Strides[2]);
                int ldc = checked((int)c.Strides[2]);
                int batchSize = checked((int)c.Sizes[0]);

                // The strided-batched API accepts 64-bit strides between consecutive
                // matrices, so the batch strides are forwarded without narrowing.
                long stra = a.Strides[0];
                long strb = b.Strides[0];
                long strc = c.Strides[0];

                // Set the math mode to allow cuBLAS to use Tensor Cores.
                CublasStatus status = CudaBlasNativeMethods.cublasSetMathMode(blas.Value.CublasHandle, ManagedCuda.CudaBlas.Math.TensorOpMath);
                if (status != CublasStatus.Success)
                {
                    // Surface the status code so the failure can be diagnosed.
                    throw new CudaBlasException($"Failed to set math mode to tensor ops. cuBLAS status: {status}");
                }

                // NOTE(review): only c.ElementType is inspected; a and b are presumably of
                // the same element type - confirm against callers.
                if (c.ElementType == DType.Float32)
                {
                    CUdeviceptr aPtrSingle = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr bPtrSingle = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr cPtrSingle = CudaHelpers.GetBufferStart(c);

                    CublasStatus _statusF32 = CudaBlasNativeMethods.cublasSgemmStridedBatched(blas.Value.CublasHandle,
                                                                                              transa, transb, m, n, k, ref alpha, aPtrSingle, lda, stra, bPtrSingle, ldb, strb, ref beta, cPtrSingle, ldc, strc, batchSize);
                    if (_statusF32 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF32);
                    }
                }
                else if (c.ElementType == DType.Float64)
                {
                    CUdeviceptr aPtrDouble = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr bPtrDouble = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr cPtrDouble = CudaHelpers.GetBufferStart(c);

                    // The double precision entry point takes its scalars as doubles.
                    double alphaDouble = alpha;
                    double betaDouble = beta;

                    CublasStatus _statusF64 = CudaBlasNativeMethods.cublasDgemmStridedBatched(blas.Value.CublasHandle,
                                                                                              transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, stra, bPtrDouble, ldb, strb, ref betaDouble, cPtrDouble, ldc, strc, batchSize);
                    if (_statusF64 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF64);
                    }
                }
                else
                {
                    throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }
Beispiel #2
0
        /// <summary>
        /// Runs a single cuBLAS GEMM on three rank-2 tensors. Per the cuBLAS contract this
        /// computes <c>c = alpha * op(a) * op(b) + beta * c</c>.
        /// </summary>
        /// <param name="context">CUDA context used to obtain a pooled cuBLAS handle for <paramref name="c"/>.</param>
        /// <param name="transA">Operation applied to <paramref name="a"/>.</param>
        /// <param name="transB">Operation applied to <paramref name="b"/>.</param>
        /// <param name="alpha">Scale applied to the matrix product (widened to double for Float64 tensors).</param>
        /// <param name="a">Left operand; <c>Strides[0]</c> must be 1.</param>
        /// <param name="b">Right operand; <c>Strides[0]</c> must be 1.</param>
        /// <param name="beta">Scale applied to the existing contents of <paramref name="c"/> (widened to double for Float64 tensors).</param>
        /// <param name="c">Output tensor; its element type selects the single or double precision kernel.</param>
        /// <exception cref="ArgumentException">An operand is not unit-stride in dimension 0.</exception>
        /// <exception cref="OverflowException">A size or leading dimension does not fit in the 32-bit ints cuBLAS expects.</exception>
        /// <exception cref="CudaBlasException">Setting the math mode or the GEMM call itself reported a failure.</exception>
        /// <exception cref="NotSupportedException">The element type of <paramref name="c"/> is neither Float32 nor Float64.</exception>
        private static void GemmOp(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[0] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }

            if (b.Strides[0] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }

            if (c.Strides[0] != 1)
            {
                throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
            }

            using var blas = context.BlasForTensor(c);
            var nta = transA == BlasOp.NonTranspose;
            var ntb = transB == BlasOp.NonTranspose;
            var transa = GetCudaBlasOp(transA);
            var transb = GetCudaBlasOp(transB);

            // Sizes and strides are 64-bit, but cuBLAS takes 32-bit dimensions and leading
            // dimensions. Convert in a checked context so an oversized tensor raises
            // OverflowException instead of silently wrapping to a bogus value.
            var m = checked((int)a.Sizes[nta ? 0 : 1]);
            var k = checked((int)b.Sizes[ntb ? 0 : 1]);
            var n = checked((int)b.Sizes[ntb ? 1 : 0]);
            var lda = checked((int)a.Strides[1]);
            var ldb = checked((int)b.Strides[1]);
            var ldc = checked((int)c.Strides[1]);

            // Allow cuBLAS to use Tensor Cores on this handle.
            var status = CudaBlasNativeMethods.cublasSetMathMode(blas.Value.CublasHandle, ManagedCuda.CudaBlas.Math.TensorOpMath);

            if (status != CublasStatus.Success)
            {
                // Surface the status code so the failure can be diagnosed.
                throw new CudaBlasException($"Failed to set math mode to tensor ops. cuBLAS status: {status}");
            }

            // NOTE(review): only c.ElementType is inspected; a and b are presumably of the
            // same element type - confirm against callers.
            if (c.ElementType == DType.Float32)
            {
                var aPtrSingle = CudaHelpers.GetBufferStart(a);
                var bPtrSingle = CudaHelpers.GetBufferStart(b);
                var cPtrSingle = CudaHelpers.GetBufferStart(c);

                var _statusF32 = CudaBlasNativeMethods.cublasSgemm_v2(blas.Value.CublasHandle,
                                                                      transa, transb, m, n, k, ref alpha, aPtrSingle, lda, bPtrSingle, ldb, ref beta, cPtrSingle, ldc);
                if (_statusF32 != CublasStatus.Success)
                {
                    throw new CudaBlasException(_statusF32);
                }
            }
            else if (c.ElementType == DType.Float64)
            {
                var aPtrDouble = CudaHelpers.GetBufferStart(a);
                var bPtrDouble = CudaHelpers.GetBufferStart(b);
                var cPtrDouble = CudaHelpers.GetBufferStart(c);

                // The double precision entry point takes its scalars as doubles.
                double alphaDouble = alpha;
                double betaDouble = beta;

                var _statusF64 = CudaBlasNativeMethods.cublasDgemm_v2(blas.Value.CublasHandle,
                                                                      transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, bPtrDouble, ldb, ref betaDouble, cPtrDouble, ldc);
                if (_statusF64 != CublasStatus.Success)
                {
                    throw new CudaBlasException(_statusF64);
                }
            }
            else
            {
                throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
            }
        }