/// <summary>
/// Entry point of the naive matrix-multiplication sample. Builds two input matrices,
/// then runs <c>ComputeRowsOfProduct</c> 10 times through the <c>HybRunner.Cuda()</c> wrapper
/// and 10 times on the CPU via <c>Parallel.For</c> (one row per iteration).
/// The two results are not compared here.
/// </summary>
/// <param name="args">
/// Either empty (sizes default to 512 512 512 512) or at least four integers:
/// heightA widthA heightB widthB. Extra arguments are ignored.
/// </param>
/// <exception cref="ArgumentException">
/// Fewer than four sizes were supplied, or widthA differs from heightB.
/// </exception>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        args = new string[] { "512", "512", "512", "512" };
    }

    // Report a partial argument list explicitly instead of failing later with an
    // IndexOutOfRangeException on args[n].
    if (args.Length < 4)
    {
        throw new ArgumentException("invalid data -- expected 4 arguments: heightA widthA heightB widthB");
    }

    const int redo = 10;

    int heightA = Convert.ToInt32(args[0]);
    int widthA = Convert.ToInt32(args[1]);
    int heightB = Convert.ToInt32(args[2]);
    int widthB = Convert.ToInt32(args[3]);

    // A (heightA x widthA) times B (heightB x widthB) requires widthA == heightB.
    if (widthA != heightB)
    {
        throw new ArgumentException("invalid data -- incompatible matrices");
    }

    Console.WriteLine("Execution Naive matrix mul with sizes ({0}, {1}) x ({2}, {3})", heightA, widthA, heightB, widthB);

    // NaiveMatrix is constructed as (width, height).
    NaiveMatrix matrixA = new NaiveMatrix(widthA, heightA);
    NaiveMatrix matrixB = new NaiveMatrix(widthB, heightB);

    NaiveMatrix res_net = new NaiveMatrix(widthB, heightA);
    NaiveMatrix res_cuda = new NaiveMatrix(widthB, heightA);

    matrixA.FillMatrix();
    matrixB.FillMatrix();

    #region CUDA

    HybRunner runner = HybRunner.Cuda().SetDistrib(4, 5, 8, 32, 32, 0);
    dynamic wrapper = runner.Wrap(new Program());

    for (int i = 0; i < redo; ++i)
    {
        // A single wrapped call covers every row of the result.
        wrapper.ComputeRowsOfProduct(res_cuda, matrixA, matrixB, 0, res_cuda.Height);
    }

    #endregion

    #region C#

    for (int i = 0; i < redo; ++i)
    {
        // Each parallel iteration computes exactly one row: [line, line + 1).
        Parallel.For(0, res_net.Height, (line) =>
        {
            ComputeRowsOfProduct(res_net, matrixA, matrixB, line, line + 1);
        });
    }

    #endregion

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Writes the matrix to the console, one row per line, with every element followed by ", ".
/// </summary>
/// <param name="M">
/// Matrix to print. Elements are read through its linear indexer as
/// <c>M[row * M.Width + column]</c>.
/// </param>
public static void displayMatrix(NaiveMatrix M)
{
    // Rows are bounded by Height. Bounding them by Width printed the wrong number
    // of rows (or indexed past the data) for non-square matrices.
    for (int i = 0; i < M.Height; ++i)
    {
        for (int j = 0; j < M.Width; ++j)
        {
            Console.Write(M[i * M.Width + j] + ", ");
        }

        Console.WriteLine();
    }
}
/// <summary>
/// Sequential reference product: writes A * B into <paramref name="res"/>, treating all three
/// <c>Values</c> buffers as N x N matrices addressed as <c>row * N + column</c>.
/// </summary>
/// <param name="A">Left operand, read through <c>A.Values</c>.</param>
/// <param name="B">Right operand, read through <c>B.Values</c>.</param>
/// <param name="res">Destination, written through <c>res.Values</c>.</param>
/// <param name="N">Number of rows, columns and inner-product terms; also the row stride.</param>
public static void reference(NaiveMatrix A, NaiveMatrix B, NaiveMatrix res, int N)
{
    for (int row = 0; row < N; ++row)
    {
        // Offset of the first element of this row in A and in res.
        int rowStart = row * N;

        for (int col = 0; col < N; ++col)
        {
            float dot = 0.0F;

            for (int inner = 0; inner < N; ++inner)
            {
                dot += A.Values[rowStart + inner] * B.Values[inner * N + col];
            }

            res.Values[rowStart + col] = dot;
        }
    }
}
/// <summary>
/// CPU reference product: writes A * B into <paramref name="result"/>, processing the rows of A
/// in parallel with <c>Parallel.For</c>.
/// </summary>
/// <param name="result">Destination, written as <c>result[B.Width * row + column]</c>.</param>
/// <param name="A">Left operand, read as <c>A[A.Width * row + k]</c>.</param>
/// <param name="B">Right operand, read as <c>B[B.Width * k + column]</c>.</param>
public static void Reference(NaiveMatrix result, NaiveMatrix A, NaiveMatrix B)
{
    Parallel.For(0, A.Height, row =>
    {
        // Linear offsets of this row in A and in the result.
        int aRowStart = A.Width * row;
        int resultRowStart = B.Width * row;

        for (int col = 0; col < B.Width; ++col)
        {
            float dot = 0.0F;

            for (int inner = 0; inner < A.Width; ++inner)
            {
                dot += A[aRowStart + inner] * B[B.Width * inner + col];
            }

            result[resultRowStart + col] = dot;
        }
    });
}
/// <summary>
/// Entry point of the tiled matrix-multiplication sample. Builds two input matrices,
/// runs <c>Multiply</c> 10 times through the <c>HybRunner.Cuda("SharedMatrix_CUDA.dll")</c>
/// wrapper, then computes the CPU product once with <c>Reference</c>.
/// The two results are not compared here.
/// </summary>
/// <param name="args">
/// Either empty (sizes default to 512 512 512 512) or at least four integers:
/// heightA widthA heightB widthB. Extra arguments are ignored.
/// </param>
/// <exception cref="ArgumentException">
/// Fewer than four sizes were supplied, widthA differs from heightB, or the matrices are not square.
/// </exception>
static void Main(string[] args)
{
    if (args.Length == 0)
    {
        args = new string[] { "512", "512", "512", "512" };
    }

    // Report a partial argument list explicitly instead of failing later with an
    // IndexOutOfRangeException on args[n].
    if (args.Length < 4)
    {
        throw new ArgumentException("invalid data -- expected 4 arguments: heightA widthA heightB widthB");
    }

    const int redo = 10;

    int heightA = Convert.ToInt32(args[0]);
    int widthA = Convert.ToInt32(args[1]);
    int heightB = Convert.ToInt32(args[2]);
    int widthB = Convert.ToInt32(args[3]);

    if (widthA != heightB)
    {
        throw new ArgumentException("invalid data -- incompatible matrices");
    }

    // Multiply is called below with a single size (matrixA.Width) that it uses for every
    // dimension, so anything but equally sized square matrices would be indexed incorrectly.
    if (heightA != widthA || widthB != heightB)
    {
        throw new ArgumentException("invalid data -- Multiply only supports square matrices of identical size");
    }

    Console.WriteLine("Execution Naive matrix mul with sizes ({0}, {1}) x ({2}, {3})", heightA, widthA, heightB, widthB);

    // NaiveMatrix is constructed as (width, height).
    NaiveMatrix matrixA = new NaiveMatrix(widthA, heightA);
    NaiveMatrix matrixB = new NaiveMatrix(widthB, heightB);

    NaiveMatrix res_net = new NaiveMatrix(widthB, heightA);
    NaiveMatrix res_cuda = new NaiveMatrix(widthB, heightA);

    matrixA.FillMatrix();
    matrixB.FillMatrix();

    #region CUDA

    HybRunner runner = HybRunner.Cuda("SharedMatrix_CUDA.dll").SetDistrib(4, 5, 32, 32, 1, 1024 * 2 * 8);
    dynamic wrapper = runner.Wrap(new Program());

    for (int i = 0; i < redo; ++i)
    {
        wrapper.Multiply(res_cuda, matrixA, matrixB, matrixA.Width);
    }

    #endregion

    #region C#

    Reference(res_net, matrixA, matrixB);

    #endregion

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Computes rows [<paramref name="lineFrom"/>, <paramref name="lineTo"/>) of
/// <paramref name="matrixA"/> * <paramref name="matrixB"/> into <paramref name="resultMatrix"/>.
/// Rows are distributed with a grid-stride loop over the y thread/block indices and
/// columns with a grid-stride loop over the x indices.
/// </summary>
/// <param name="resultMatrix">Destination, written as <c>resultMatrix[row * matrixB.Width + column]</c>.</param>
/// <param name="matrixA">Left operand, read as <c>matrixA[row * matrixA.Width + k]</c>.</param>
/// <param name="matrixB">Right operand, read as <c>matrixB[k * matrixB.Width + column]</c>.</param>
/// <param name="lineFrom">First result row to compute (inclusive).</param>
/// <param name="lineTo">Row at which to stop (exclusive).</param>
public static void ComputeRowsOfProduct(NaiveMatrix resultMatrix, NaiveMatrix matrixA, NaiveMatrix matrixB, int lineFrom, int lineTo)
{
    int commonSize = matrixA.Width;
    int bWidth = matrixB.Width;

    for (int i = lineFrom + threadIdx.y + blockIdx.y * blockDim.y; i < lineTo; i += blockDim.y * gridDim.y)
    {
        // Row offsets do not depend on the column or on k, so compute them once per row.
        int aRowStart = i * commonSize;
        int resultRowStart = i * bWidth;

        for (int j = threadIdx.x + blockIdx.x * blockDim.x; j < bWidth; j += blockDim.x * gridDim.x)
        {
            // Accumulate in a local and store once, rather than reading and writing
            // the result element on every k iteration.
            float accum = 0.0f;

            for (int k = 0; k < commonSize; ++k)
            {
                accum += matrixA[aRowStart + k] * matrixB[k * bWidth + j];
            }

            resultMatrix[resultRowStart + j] = accum;
        }
    }
}
/// <summary>
/// Entry point of the cuBLAS sample. Multiplies two 1024 x 1024 matrices with
/// <c>cublasSgemm</c>, recomputes the product with <c>reference</c>, and compares the two
/// element by element. On the first mismatch it prints the element and exits with code 1.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    const int N = 1024;

    Console.WriteLine("Execution cublas matrix mul with sizes ({0}, {1}) x ({2}, {3})", N, N, N, N);

    NaiveMatrix matrixA = new NaiveMatrix(N, N);
    NaiveMatrix matrixB = new NaiveMatrix(N, N);
    NaiveMatrix res = new NaiveMatrix(N, N);
    NaiveMatrix res_net = new NaiveMatrix(N, N);

    matrixA.FillMatrix();
    matrixB.FillMatrix();

    // res = alpha * A * B + beta * res, so alpha = 1 and beta = 0 give the plain product.
    float alpha = 1.0f;
    float beta = 0.0f;

    cublas cublas = new cublas();
    cublasHandle_t handle;
    cublas.Create(out handle);

    try
    {
        cublasOperation_t transA = cublasOperation_t.CUBLAS_OP_N;
        cublasOperation_t transB = cublasOperation_t.CUBLAS_OP_N;

        // NOTE(review): cuBLAS GEMM is specified for column-major storage, while reference()
        // indexes the buffers as row * N + column. Confirm that this cublasSgemm binding (or
        // the data layout) accounts for that; otherwise the operand order needs swapping.
        cublasSgemm(handle, transA, transB, N, N, N, &alpha, matrixA.Values, N, matrixB.Values, N, &beta, res.Values, N);
    }
    finally
    {
        // Release the handle even if the GEMM call throws.
        cublas.Destroy(handle);
    }

    reference(matrixA, matrixB, res_net, N);

    for (int i = 0; i < N * N; ++i)
    {
        // Written as a negated "<" so that a NaN result fails the check too:
        // every comparison with NaN is false, so ">= tolerance" silently accepted NaN.
        if (!(Math.Abs(res[i] - res_net[i]) < 1.0E-3))
        {
            Console.WriteLine("Error at {0}, expected {1}, got {2}", i, res_net[i], res[i]);
            Environment.Exit(1);
        }
    }

    Console.Out.WriteLine("DONE");
}
/// <summary>
/// Tiled product of two <paramref name="size"/> x <paramref name="size"/> matrices:
/// writes A * B into <paramref name="result"/>. Each thread block walks the output tiles with
/// grid-stride loops over blockIdx; for every output tile it stages one tile of A and one tile
/// of B at a time in buffers obtained from <c>SharedMemoryAllocator</c>, and each thread
/// accumulates one output element.
/// </summary>
/// <remarks>
/// <paramref name="size"/> does not have to be a multiple of the block dimensions: loads outside
/// the matrices are replaced by zeros and stores outside the matrix are skipped.
/// The tile layout requires blockDim.y &gt;= blockDim.x (for example square blocks), because row k
/// of the B tile is loaded by the threads whose threadIdx.y equals k.
/// </remarks>
/// <param name="result">Destination, written as <c>result[row * size + column]</c>.</param>
/// <param name="A">Left operand, read as <c>A[row * size + k]</c>.</param>
/// <param name="B">Right operand, read as <c>B[k * size + column]</c>.</param>
/// <param name="size">Number of rows and columns of all three matrices.</param>
public static void Multiply(NaiveMatrix result, NaiveMatrix A, NaiveMatrix B, int size)
{
    SharedMemoryAllocator<float> allocator = new SharedMemoryAllocator<float>();
    float[] cacheA = allocator.allocate(blockDim.y * blockDim.x);
    float[] cacheB = allocator.allocate(blockDim.y * blockDim.x);

    int tx = threadIdx.x, ty = threadIdx.y;

    // Round up so that partial tiles at the right and bottom edges are processed too.
    int tilesY = (size + blockDim.y - 1) / blockDim.y;
    int tilesX = (size + blockDim.x - 1) / blockDim.x;

    // by and bx depend only on blockIdx, so every thread of a block runs the same number of
    // iterations and reaches every SyncThreads() call below.
    for (int by = blockIdx.y; by < tilesY; by += gridDim.y)
    {
        for (int bx = blockIdx.x; bx < tilesX; bx += gridDim.x)
        {
            int i = by * blockDim.y + ty;
            int j = bx * blockDim.x + tx;

            float Pvalue = 0;

            // The inner dimension is cut into chunks of blockDim.x elements.
            for (int blockIdread = 0; blockIdread < tilesX; ++blockIdread)
            {
                int aColumn = blockIdread * blockDim.x + tx;
                int bRow = blockIdread * blockDim.x + ty;

                // Out-of-range positions are padded with zeros; they then add nothing to the sum.
                float aValue = 0.0f;
                if (i < size && aColumn < size)
                {
                    aValue = A[i * size + aColumn];
                }

                float bValue = 0.0f;
                if (bRow < size && j < size)
                {
                    bValue = B[bRow * size + j];
                }

                // Both tiles use a row stride of blockDim.x, matching the reads below.
                cacheA[ty * blockDim.x + tx] = aValue;
                cacheB[ty * blockDim.x + tx] = bValue;

                // Wait until both tiles are completely loaded.
                SyncThreads();

                for (int k = 0; k < blockDim.x; ++k)
                {
                    Pvalue += cacheA[ty * blockDim.x + k] * cacheB[k * blockDim.x + tx];
                }

                // Wait until every thread is done reading before the tiles are overwritten.
                SyncThreads();
            }

            // Threads outside the matrix only helped with loading and synchronisation.
            if (i < size && j < size)
            {
                result[i * size + j] = Pvalue;
            }
        }
    }
}