/// <summary>
/// Smoke test that pushes one single-precision GEMM through the CUBLAS wrapper
/// on the emulator target.
/// </summary>
/// <remarks>
/// The host arrays are freshly allocated (all zeros) and GEMM is called with
/// m = n = k = 10, so this only exercises the call path; the result copied back
/// into <c>c</c> is not inspected.
/// </remarks>
public static void MyFirstBlasEmulatorTest()
{
    Console.WriteLine("MyTest()");

    // Get GPU device
    CudafyModes.Target = eGPUType.Emulator;
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target);

    // Create GPGPUBLAS (CUBLAS Wrapper)
    using (GPGPUBLAS blas = GPGPUBLAS.Create(gpu))
    {
        const int N = 100;
        float[] a = new float[N];
        float[] b = new float[N];
        float[] c = new float[N];
        float alpha = -1;
        float beta = 0;

        int m = 10;
        int n = 10;
        int k = 10;
        cublasOperation Op = cublasOperation.N;

        // Declared up front so the finally block can release whichever
        // buffers were successfully created before a failure.
        float[] device_a = null;
        float[] device_b = null;
        float[] device_c = null;
        try
        {
            device_a = gpu.CopyToDevice(a);
            device_b = gpu.CopyToDevice(b);
            device_c = gpu.CopyToDevice(c);

            blas.GEMM(m, k, n, alpha, device_a, device_b, beta, device_c, Op);

            gpu.CopyFromDevice<float>(device_c, c);
        }
        finally
        {
            // Release only the buffers this method created, even when GEMM or a
            // copy throws; previously they were never freed.
            if (device_c != null)
            {
                gpu.Free(device_c);
            }
            if (device_b != null)
            {
                gpu.Free(device_b);
            }
            if (device_a != null)
            {
                gpu.Free(device_a);
            }
        }
    }
}
//
// http://stackoverflow.com/questions/18628447/cudafy-throws-an-exception-while-testing
//
/// <summary>
/// Loads a Cudafy module for the selected emulator device, runs one
/// single-precision GEMM through the CUBLAS wrapper, and then throws.
/// </summary>
/// <param name="deviceId">Device index passed to <c>CudafyHost.GetDevice</c> and stored in <c>CudafyModes.DeviceId</c>.</param>
/// <exception cref="NotImplementedException">
/// Always thrown once the GEMM call has returned.
/// NOTE(review): this looks like a marker that the sample is unfinished — confirm before relying on it.
/// </exception>
private static void BlasSample(int deviceId)
{
    CudafyModes.Target = eGPUType.Emulator;
    GPGPU gpu = CudafyHost.GetDevice(CudafyModes.Target, deviceId);
    CudafyModes.DeviceId = deviceId;

    eArchitecture arch = gpu.GetArchitecture();
    CudafyModule km = CudafyTranslator.Cudafy(arch);
    gpu.LoadModule(km);

    // Dispose the BLAS wrapper deterministically; it was previously created
    // and never disposed.
    using (GPGPUBLAS blas = GPGPUBLAS.Create(gpu))
    {
        const int N = 100;
        float[] a = new float[N];
        float[] b = new float[N];
        float[] c = new float[N];
        float alpha = -1;
        float beta = 0;

        int m = 10;
        int n = 10;
        int k = 10;
        cublasOperation Op = cublasOperation.N;

        // Declared up front so the finally block can release whichever
        // buffers were successfully created before a failure.
        float[] device_a = null;
        float[] device_b = null;
        float[] device_c = null;
        try
        {
            device_a = gpu.CopyToDevice(a);
            device_b = gpu.CopyToDevice(b);
            device_c = gpu.CopyToDevice(c);

            blas.GEMM(m, k, n, alpha, device_a, device_b, beta, device_c, Op);
        }
        finally
        {
            // Release only the buffers this method created; previously they leaked.
            if (device_c != null)
            {
                gpu.Free(device_c);
            }
            if (device_b != null)
            {
                gpu.Free(device_b);
            }
            if (device_a != null)
            {
                gpu.Free(device_a);
            }
        }
    }

    // Preserved from the original: the sample ends by throwing unconditionally.
    throw new NotImplementedException();
}
// http://peterwittek.com/2013/06/cublas-matrix-c-style/
// Row-major to column-major "trick".
/// <summary>
/// Issues a GEMM for row-major operands by handing them to the BLAS wrapper in
/// swapped order (B first, then A), with the result written to <paramref name="C"/>.
/// </summary>
/// <param name="A">Left operand from the caller's point of view; passed to GEMM as the second input array.</param>
/// <param name="B">Right operand from the caller's point of view; passed to GEMM as the first input array.</param>
/// <param name="C">Array whose <c>GPUArray</c> is passed as the GEMM output argument.</param>
/// <param name="cMultiplier">Scalar passed to GEMM immediately before the output array (the alpha scalar is fixed at 1).</param>
/// <param name="transposeA">When true, the operation flag for A is set to <c>cublasOperation.T</c> and A's row/column counts swap roles for n and k.</param>
/// <param name="transposeB">When true, the operation flag for B is set to <c>cublasOperation.T</c> and B's row count is used for m and ldc.</param>
public void GemmRowMajor(CpuGpuArray A, CpuGpuArray B, CpuGpuArray C, float cMultiplier = 0f, bool transposeA = false, bool transposeB = false)
{
    // Swap the operands: BLAS sees B as its first matrix and A as its second.
    var first = B;
    var second = A;

    // Conventional GEMM dimensions:
    //   M = row count of A(T) and C
    //   N = column count of B(T) and C
    //   K = column count of A(T) = row count of B(T)
    // After the swap they become:
    //   M = column count of B(T) and C
    //   N = row count of A(T) and C
    //   K = column count of A(T) = row count of B
    var opFirst = transposeB ? cublasOperation.T : cublasOperation.N;
    var m = transposeB ? first.RowCount : first.ColCount;

    var opSecond = transposeA ? cublasOperation.T : cublasOperation.N;
    var n = transposeA ? second.ColCount : second.RowCount;
    var k = transposeA ? second.RowCount : second.ColCount;

    // Leading dimensions of the inputs do not depend on the transpose flags;
    // the output's leading dimension always matches m.
    var lda = first.ColCount;
    var ldb = second.ColCount;
    var ldc = m;

    Blas.GEMM(m, k, n, 1f, first.GPUArray, second.GPUArray, cMultiplier, C.GPUArray,
        lda: lda, ldb: ldb, ldc: ldc, transb: opSecond, transa: opFirst);
}
/// <summary>
/// Replaces <paramref name="dm"/> with the product of its transpose and itself,
/// computed by a double-precision GEMM on a CUDA device.
/// </summary>
/// <param name="dm">
/// Input matrix. It is first swapped for a 1x1 placeholder (dropping this method's
/// reference to the input) and, on success, reassigned to a new
/// <c>ColumnCount x ColumnCount</c> matrix built from the GEMM output.
/// If an exception is thrown part-way, <paramref name="dm"/> is left as the 1x1 placeholder.
/// </param>
public static void cudaTransposeAndMultiply(ref MathNet.Numerics.LinearAlgebra.Double.DenseMatrix dm)
{
    // Kept from the original: translate and serialize the Cudafy module.
    // NOTE(review): the module is never loaded here — confirm whether this step is still needed.
    Cudafy.CudafyModule km = Cudafy.Translator.CudafyTranslator.Cudafy();
    km.Serialize();

    GPGPU gpu = CudafyHost.GetDevice(eGPUType.Cuda);

    int cols = dm.ColumnCount, rows = dm.RowCount;
    double[] a = dm.ToColumnWiseArray();

    // Drop the reference to the input matrix before the device work starts.
    dm = new MathNet.Numerics.LinearAlgebra.Double.DenseMatrix(1, 1);

    try
    {
        // Dispose the BLAS wrapper deterministically; it was previously never disposed.
        using (GPGPUBLAS blas = GPGPUBLAS.Create(gpu))
        {
            double[] a_d = gpu.CopyToDevice<double>(a);
            double[] c_d = gpu.Allocate<double>(cols * cols);

            // NOTE(review): the timer is started but never stopped or read in this method.
            gpu.StartTimer();

            // The same device buffer is passed as both inputs, with the first one
            // flagged as transposed; m = n = cols and k = rows.
            blas.GEMM(cols, rows, cols, 1, a_d, a_d, 0, c_d, Cudafy.Maths.BLAS.Types.cublasOperation.T);

            a = new double[cols * cols];
            gpu.CopyFromDevice<double>(c_d, a);
        }
    }
    finally
    {
        // Free device memory on failure paths too; previously this was skipped
        // whenever an exception escaped.
        gpu.FreeAll();
    }

    dm = new MathNet.Numerics.LinearAlgebra.Double.DenseMatrix(cols, cols, a);
}
/// <summary>
/// Compares the device GEMM result against a CPU reference for all four
/// transpose combinations of A and B.
/// </summary>
/// <remarks>
/// Each case refills the host buffers, uploads them, runs GEMM, downloads C and
/// checks every element, then frees all device memory.
/// NOTE(review): the comparison uses exact equality on floating-point values —
/// confirm whether a tolerance is intended.
/// </remarks>
public void Test_BLAS3_GEMM()
{
    // Case 1 - A: no transpose, B: no transpose
    ClearBuffer(hiMatrixAMK);
    ClearBuffer(hiMatrixBKN);
    ClearBuffer(hiMatrixCMN);
    FillBuffer(hiMatrixAMK);
    FillBuffer(hiMatrixBKN);
    FillBuffer(hiMatrixCMN);
    diMatrixA = _gpu.CopyToDevice(hiMatrixAMK);
    diMatrixB = _gpu.CopyToDevice(hiMatrixBKN);
    diMatrixC = _gpu.CopyToDevice(hiMatrixCMN);
    _blas.GEMM(M, K, N, Alpha, diMatrixA, diMatrixB, Beta, diMatrixC);
    _gpu.CopyFromDevice(diMatrixC, gpuResultMN);

    for (int row = 0; row < M; row++)
    {
        for (int col = 0; col < N; col++)
        {
            var cIndex = GetIndexColumnMajor(row, col, M);
            double expected = 0.0;
            for (int inner = 0; inner < K; inner++)
            {
                expected += Alpha * hiMatrixAMK[GetIndexColumnMajor(row, inner, M)] * hiMatrixBKN[GetIndexColumnMajor(inner, col, K)];
            }
            expected += Beta * hiMatrixCMN[cIndex];
            Assert.AreEqual(expected, gpuResultMN[cIndex]);
        }
    }
    _gpu.FreeAll();

    // Case 2 - A: transpose, B: no transpose (A is stored K x M)
    ClearBuffer(hiMatrixAKM);
    ClearBuffer(hiMatrixBKN);
    ClearBuffer(hiMatrixCMN);
    FillBuffer(hiMatrixAKM);
    FillBuffer(hiMatrixBKN);
    FillBuffer(hiMatrixCMN);
    diMatrixA = _gpu.CopyToDevice(hiMatrixAKM);
    diMatrixB = _gpu.CopyToDevice(hiMatrixBKN);
    diMatrixC = _gpu.CopyToDevice(hiMatrixCMN);
    _blas.GEMM(M, K, N, Alpha, diMatrixA, diMatrixB, Beta, diMatrixC, cublasOperation.T);
    _gpu.CopyFromDevice(diMatrixC, gpuResultMN);

    for (int row = 0; row < M; row++)
    {
        for (int col = 0; col < N; col++)
        {
            var cIndex = GetIndexColumnMajor(row, col, M);
            double expected = 0.0;
            for (int inner = 0; inner < K; inner++)
            {
                expected += Alpha * hiMatrixAKM[GetIndexColumnMajor(inner, row, K)] * hiMatrixBKN[GetIndexColumnMajor(inner, col, K)];
            }
            expected += Beta * hiMatrixCMN[cIndex];
            Assert.AreEqual(expected, gpuResultMN[cIndex]);
        }
    }
    _gpu.FreeAll();

    // Case 3 - A: no transpose, B: transpose (B is stored N x K)
    ClearBuffer(hiMatrixAMK);
    ClearBuffer(hiMatrixBNK);
    ClearBuffer(hiMatrixCMN);
    FillBuffer(hiMatrixAMK);
    FillBuffer(hiMatrixBNK);
    FillBuffer(hiMatrixCMN);
    diMatrixA = _gpu.CopyToDevice(hiMatrixAMK);
    diMatrixB = _gpu.CopyToDevice(hiMatrixBNK);
    diMatrixC = _gpu.CopyToDevice(hiMatrixCMN);
    _blas.GEMM(M, K, N, Alpha, diMatrixA, diMatrixB, Beta, diMatrixC, cublasOperation.N, cublasOperation.T);
    _gpu.CopyFromDevice(diMatrixC, gpuResultMN);

    for (int row = 0; row < M; row++)
    {
        for (int col = 0; col < N; col++)
        {
            var cIndex = GetIndexColumnMajor(row, col, M);
            double expected = 0.0;
            for (int inner = 0; inner < K; inner++)
            {
                expected += Alpha * hiMatrixAMK[GetIndexColumnMajor(row, inner, M)] * hiMatrixBNK[GetIndexColumnMajor(col, inner, N)];
            }
            expected += Beta * hiMatrixCMN[cIndex];
            Assert.AreEqual(expected, gpuResultMN[cIndex]);
        }
    }
    _gpu.FreeAll();

    // Case 4 - A: transpose, B: transpose
    ClearBuffer(hiMatrixAKM);
    ClearBuffer(hiMatrixBNK);
    ClearBuffer(hiMatrixCMN);
    FillBuffer(hiMatrixAKM);
    FillBuffer(hiMatrixBNK);
    FillBuffer(hiMatrixCMN);
    diMatrixA = _gpu.CopyToDevice(hiMatrixAKM);
    diMatrixB = _gpu.CopyToDevice(hiMatrixBNK);
    diMatrixC = _gpu.CopyToDevice(hiMatrixCMN);
    _blas.GEMM(M, K, N, Alpha, diMatrixA, diMatrixB, Beta, diMatrixC, cublasOperation.T, cublasOperation.T);
    _gpu.CopyFromDevice(diMatrixC, gpuResultMN);

    for (int row = 0; row < M; row++)
    {
        for (int col = 0; col < N; col++)
        {
            var cIndex = GetIndexColumnMajor(row, col, M);
            double expected = 0.0;
            for (int inner = 0; inner < K; inner++)
            {
                expected += Alpha * hiMatrixAKM[GetIndexColumnMajor(inner, row, K)] * hiMatrixBNK[GetIndexColumnMajor(col, inner, N)];
            }
            expected += Beta * hiMatrixCMN[cIndex];
            Assert.AreEqual(expected, gpuResultMN[cIndex]);
        }
    }
    _gpu.FreeAll();
}