コード例 #1
0
        private static void GemmOp(BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[0] != 1)
            {
                throw new ArgumentException("a must be contiguous in the first dimension (column major / fortran order)");
            }

            if (b.Strides[0] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }

            if (c.Strides[0] != 1)
            {
                throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
            }

            unsafe
            {
                // dimensons: (m x k) * (k * n) = (m x n)
                bool nta    = transA == BlasOp.NonTranspose;
                bool ntb    = transB == BlasOp.NonTranspose;
                byte transa = (byte)transA;
                byte transb = (byte)transB;
                int  m      = (int)a.Sizes[nta ? 0 : 1];
                int  k      = (int)b.Sizes[ntb ? 0 : 1];
                int  n      = (int)b.Sizes[ntb ? 1 : 0];
                int  lda    = (int)a.Strides[1];
                int  ldb    = (int)b.Strides[1];
                int  ldc    = (int)c.Strides[1];

                if (c.ElementType == DType.Float32)
                {
                    float *aPtrSingle = (float *)CpuNativeHelpers.GetBufferStart(a);
                    float *bPtrSingle = (float *)CpuNativeHelpers.GetBufferStart(b);
                    float *cPtrSingle = (float *)CpuNativeHelpers.GetBufferStart(c);

                    SGEMM sgemm = new SGEMM();
                    sgemm.Run(System.Text.ASCIIEncoding.ASCII.GetString(&transa, 1), System.Text.ASCIIEncoding.ASCII.GetString(&transb, 1), m, n, k, alpha, aPtrSingle, lda, bPtrSingle, ldb, beta, cPtrSingle, ldc);
                }
                else if (c.ElementType == DType.Float64)
                {
                    double *aPtrDouble  = (double *)CpuNativeHelpers.GetBufferStart(a);
                    double *bPtrDouble  = (double *)CpuNativeHelpers.GetBufferStart(b);
                    double *cPtrDouble  = (double *)CpuNativeHelpers.GetBufferStart(c);
                    double  alphaDouble = alpha;
                    double  betaDouble  = beta;
                    OpenBlasNative.dgemm_(&transa, &transb, &m, &n, &k, &alphaDouble, aPtrDouble, &lda, bPtrDouble, &ldb, &betaDouble, cPtrDouble, &ldc);
                }
                else
                {
                    throw new NotSupportedException("CPU GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }
コード例 #2
0
        private static void GemmOp(BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[0] != 1)
            {
                throw new ArgumentException("a must be contiguous in the first dimension (column major / fortran order)");
            }

            if (b.Strides[0] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }

            if (c.Strides[0] != 1)
            {
                throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
            }

            unsafe
            {
                // dimensons: (m x k) * (k * n) = (m x n)
                var nta    = transA == BlasOp.NonTranspose;
                var ntb    = transB == BlasOp.NonTranspose;
                var transa = (byte)transA;
                var transb = (byte)transB;
                var m      = (int)a.Sizes[nta ? 0 : 1];
                var k      = (int)b.Sizes[ntb ? 0 : 1];
                var n      = (int)b.Sizes[ntb ? 1 : 0];
                var lda    = (int)a.Strides[1];
                var ldb    = (int)b.Strides[1];
                var ldc    = (int)c.Strides[1];

                if (c.ElementType == DType.Float32)
                {
                    var aPtrSingle = (float *)CpuNativeHelpers.GetBufferStart(a);
                    var bPtrSingle = (float *)CpuNativeHelpers.GetBufferStart(b);
                    var cPtrSingle = (float *)CpuNativeHelpers.GetBufferStart(c);

                    OpenBlasNative.sgemm_(&transa, &transb, &m, &n, &k, &alpha, aPtrSingle, &lda, bPtrSingle, &ldb, &beta, cPtrSingle, &ldc);
                }
                else if (c.ElementType == DType.Float64)
                {
                    var    aPtrDouble  = (double *)CpuNativeHelpers.GetBufferStart(a);
                    var    bPtrDouble  = (double *)CpuNativeHelpers.GetBufferStart(b);
                    var    cPtrDouble  = (double *)CpuNativeHelpers.GetBufferStart(c);
                    double alphaDouble = alpha;
                    double betaDouble  = beta;
                    OpenBlasNative.dgemm_(&transa, &transb, &m, &n, &k, &alphaDouble, aPtrDouble, &lda, bPtrDouble, &ldb, &betaDouble, cPtrDouble, &ldc);
                }
                else
                {
                    throw new NotSupportedException("CPU GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }
コード例 #3
0
        private static Operation GetCudaBlasOp(BlasOp op)
        {
            switch (op)
            {
            case BlasOp.NonTranspose: return(Operation.NonTranspose);

            case BlasOp.Transpose: return(Operation.Transpose);

            case BlasOp.ConjugateTranspose: return(Operation.ConjugateTranspose);

            default:
                throw new InvalidOperationException("BlasOp not supported: " + op);
            }
        }
コード例 #4
0
        private static void GemmOp(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[0] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }
            if (b.Strides[0] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }
            if (c.Strides[0] != 1)
            {
                throw new ArgumentException("c must be contiguous in the first dimension (column major / fortran order)");
            }

            using (var blas = context.BlasForTensor(c))
            {
                bool      nta    = transA == BlasOp.NonTranspose;
                bool      ntb    = transB == BlasOp.NonTranspose;
                Operation transa = GetCudaBlasOp(transA);
                Operation transb = GetCudaBlasOp(transB);
                int       m      = (int)a.Sizes[nta ? 0 : 1];
                int       k      = (int)b.Sizes[ntb ? 0 : 1];
                int       n      = (int)b.Sizes[ntb ? 1 : 0];
                int       lda    = (int)a.Strides[1];
                int       ldb    = (int)b.Strides[1];
                int       ldc    = (int)c.Strides[1];



                if (c.ElementType == DType.Float32)
                {
                    var aPtrSingle = CudaHelpers.GetBufferStart(a);
                    var bPtrSingle = CudaHelpers.GetBufferStart(b);
                    var cPtrSingle = CudaHelpers.GetBufferStart(c);

                    var _statusF32 = CudaBlasNativeMethods.cublasSgemm_v2(blas.Value.CublasHandle,
                                                                          transa, transb, m, n, k, ref alpha, aPtrSingle, lda, bPtrSingle, ldb, ref beta, cPtrSingle, ldc);
                    if (_statusF32 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF32);
                    }
                }
                else if (c.ElementType == DType.Float64)
                {
                    var aPtrDouble  = CudaHelpers.GetBufferStart(a);
                    var bPtrDouble  = CudaHelpers.GetBufferStart(b);
                    var cPtrDouble  = CudaHelpers.GetBufferStart(c);
                    var alphaDouble = (double)alpha;
                    var betaDouble  = (double)beta;
                    var _statusF64  = CudaBlasNativeMethods.cublasDgemm_v2(blas.Value.CublasHandle,
                                                                           transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, bPtrDouble, ldb, ref betaDouble, cPtrDouble, ldc);
                    if (_statusF64 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF64);
                    }
                }
                else
                {
                    throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }
コード例 #5
0
        // Computes  c := alpha * a * b  +  beta * c
        public static void Gemm(TSCudaContext context, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Sizes[0] != c.Sizes[0] || b.Sizes[1] != c.Sizes[1] || a.Sizes[1] != b.Sizes[0])
            {
                throw new InvalidOperationException("Size mismatch");
            }

            BlasOp aOp   = default(BlasOp);
            BlasOp bOp   = default(BlasOp);
            bool   copyC = false;

            Tensor aClone = null;
            Tensor bClone = null;
            Tensor cClone = null;


            if (c.Strides[0] == 1 &&
                c.Strides[1] != 0 && c.Strides[1] != 1)
            {
                // If c is contiguous in dimension 0 (column-major)
                aClone = a.CopyRef();
                bClone = b.CopyRef();
                cClone = c.CopyRef();
            }
            else if (c.Strides[1] == 1 &&
                     c.Strides[0] != 0 && c.Strides[0] != 1)
            {
                // If c is contiguous in dimension 1 (row-major)
                // using (a * b)' == b' * a'
                // we can pass row-major matrices to BLAS functions that expect column-major by swapping A and B,
                // and transposing all 3 matrices

                cClone = c.Transpose();
                aClone = b.Transpose(); // Note swap of a and b
                bClone = a.Transpose();
            }
            else
            {
                var cNew = new Tensor(c.Allocator, c.ElementType, c.Sizes[1], c.Sizes[0]);
                cClone = cNew.Transpose();
                Ops.Copy(cClone, c);
                cNew.Dispose();
                copyC = true;

                aClone = a.CopyRef();
                bClone = b.CopyRef();
            }

            try
            {
                if (aClone.Strides[0] == 1 &&
                    aClone.Strides[1] != 0 && aClone.Strides[1] != 1)
                {
                    // If a is contiguous in dimension 0 (column-major)
                    aOp = BlasOp.NonTranspose;
                }
                else if (aClone.Strides[1] == 1 &&
                         aClone.Strides[0] != 0 && aClone.Strides[0] != 1)
                {
                    aOp = BlasOp.Transpose;
                    var aNew = aClone.Transpose();
                    aClone.Dispose();
                    aClone = aNew;
                }
                else
                {
                    var aNew    = new Tensor(aClone.Allocator, aClone.ElementType, aClone.Sizes[1], aClone.Sizes[0]);
                    var aClone2 = aNew.Transpose();
                    Ops.Copy(aClone2, aClone);
                    aClone.Dispose();
                    aClone = aClone2;
                    aNew.Dispose();

                    aOp = BlasOp.NonTranspose;
                }

                if (bClone.Strides[0] == 1 &&
                    bClone.Strides[1] != 0 && bClone.Strides[1] != 1)
                {
                    // If a is contiguous in dimension 0 (column-major)
                    bOp = BlasOp.NonTranspose;
                }
                else if (bClone.Strides[1] == 1 &&
                         bClone.Strides[0] != 0 && bClone.Strides[0] != 1)
                {
                    bOp = BlasOp.Transpose;
                    var bNew = bClone.Transpose();
                    bClone.Dispose();
                    bClone = bNew;
                }
                else
                {
                    var bNew    = new Tensor(bClone.Allocator, bClone.ElementType, bClone.Sizes[1], bClone.Sizes[0]);
                    var bClone2 = bNew.Transpose();
                    Ops.Copy(bClone2, bClone);
                    bClone.Dispose();
                    bClone = bClone2;
                    bNew.Dispose();

                    bOp = BlasOp.NonTranspose;
                }

                GemmOp(context, aOp, bOp, alpha, aClone, bClone, beta, cClone);

                if (copyC)
                {
                    Ops.Copy(c, cClone);
                }
            }
            finally
            {
                aClone.Dispose();
                bClone.Dispose();
                cClone.Dispose();
            }
        }
コード例 #6
0
        private static void GemmOpBatch(TSCudaContext context, BlasOp transA, BlasOp transB, float alpha, Tensor a, Tensor b, float beta, Tensor c)
        {
            if (a.Strides[1] != 1)
            {
                throw new ArgumentException($"a must be contiguous in the first dimension (column major / fortran order). ({a.Strides[0]},{a.Strides[1]}) ({b.Strides[0]},{b.Strides[1]}) ({c.Strides[0]},{c.Strides[1]})");
            }

            if (b.Strides[1] != 1)
            {
                throw new ArgumentException("b must be contiguous in the first dimension (column major / fortran order)");
            }

            if (c.Strides[1] != 1)
            {
                throw new ArgumentException($"c must be contiguous in the first dimension (column major / fortran order) ({a.Strides[0]}, {a.Strides[1]}, {a.Strides[2]}) ({b.Strides[0]}, {b.Strides[1]}, {b.Strides[2]}) ({c.Strides[0]}, {c.Strides[1]}, {c.Strides[2]})");
            }

            using (Util.PooledObject <CudaBlas> blas = context.BlasForTensor(c))
            {
                bool      nta    = transA == BlasOp.NonTranspose;
                bool      ntb    = transB == BlasOp.NonTranspose;
                Operation transa = GetCudaBlasOp(transA);
                Operation transb = GetCudaBlasOp(transB);
                int       m      = (int)a.Sizes[nta ? 1 : 2];
                int       k      = (int)b.Sizes[ntb ? 1 : 2];
                int       n      = (int)b.Sizes[ntb ? 2 : 1];
                int       lda    = (int)a.Strides[2];
                int       ldb    = (int)b.Strides[2];
                int       ldc    = (int)c.Strides[2];

                int stra      = (int)a.Strides[0];
                int strb      = (int)b.Strides[0];
                int strc      = (int)c.Strides[0];
                int batchSize = (int)c.Sizes[0];


                //// Set the math mode to allow cuBLAS to use Tensor Cores:
                //cublasStat = cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);

                CublasStatus status = CudaBlasNativeMethods.cublasSetMathMode(blas.Value.CublasHandle, ManagedCuda.CudaBlas.Math.TensorOpMath);
                if (status != CublasStatus.Success)
                {
                    throw new CudaBlasException($"Failed to set math mode to tensor ops.");
                }


                if (c.ElementType == DType.Float32)
                {
                    CUdeviceptr aPtrSingle = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr bPtrSingle = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr cPtrSingle = CudaHelpers.GetBufferStart(c);

                    CublasStatus _statusF32 = CudaBlasNativeMethods.cublasSgemmStridedBatched(blas.Value.CublasHandle,
                                                                                              transa, transb, m, n, k, ref alpha, aPtrSingle, lda, stra, bPtrSingle, ldb, strb, ref beta, cPtrSingle, ldc, strc, batchSize);
                    if (_statusF32 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF32);
                    }
                }
                else if (c.ElementType == DType.Float64)
                {
                    CUdeviceptr  aPtrDouble  = CudaHelpers.GetBufferStart(a);
                    CUdeviceptr  bPtrDouble  = CudaHelpers.GetBufferStart(b);
                    CUdeviceptr  cPtrDouble  = CudaHelpers.GetBufferStart(c);
                    double       alphaDouble = alpha;
                    double       betaDouble  = beta;
                    CublasStatus _statusF64  = CudaBlasNativeMethods.cublasDgemmStridedBatched(blas.Value.CublasHandle,
                                                                                               transa, transb, m, n, k, ref alphaDouble, aPtrDouble, lda, stra, bPtrDouble, ldb, strb, ref betaDouble, cPtrDouble, ldc, strc, batchSize);
                    if (_statusF64 != CublasStatus.Success)
                    {
                        throw new CudaBlasException(_statusF64);
                    }
                }
                else
                {
                    throw new NotSupportedException("CUDA GEMM with element type " + c.ElementType + " not supported");
                }
            }
        }