/// <summary>
/// Launches the <c>MatrixMulShared</c> kernel on a CUDA accelerator for the two
/// input matrices and returns the contents of the output device buffer.
/// </summary>
/// <param name="a">Left operand; flattened via <c>FlatternArr</c> before upload.
/// NOTE(review): presumably an N x N matrix - confirm against callers.</param>
/// <param name="b">Right operand; flattened via <c>FlatternArr</c> before upload.
/// NOTE(review): presumably an N x N matrix - confirm against callers.</param>
/// <param name="N">Matrix dimension; each device buffer holds <c>N * N</c> floats.</param>
/// <param name="sw">Stopwatch that is restarted immediately before the kernel launch and
/// stopped after the accelerator has synchronized. Allocation, host-to-device copies and
/// the device-to-host read-back are outside the measured interval.</param>
/// <returns>The output buffer copied back to the host as a flat array of
/// <c>N * N</c> elements.</returns>
public static float[] RunMatrixMulShared(float[][] a, float[][] b, int N, ref Stopwatch sw)
{
    // Context and accelerator own native resources; dispose them deterministically
    // (also on exceptions) instead of leaking them on every call.
    using (var context = new Context())
    using (var gpu = new CudaAccelerator(context))
    {
        // Create typed launcher
        var matrixMulKernelShared = gpu.LoadStreamKernel<
            ArrayView<float>, ArrayView<float>, ArrayView<float>, int>(MatrixMulShared);

        // Allocate device memory. The nested usings release the buffers before the
        // accelerator that created them is disposed.
        var buffSize = N * N;
        using (MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize))
        using (MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize))
        using (MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize))
        {
            // Upload both operands
            d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
            d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

            // Groups per grid dimension: rounded up so the grid covers all N
            // rows/columns even when N is not a multiple of groupSize.
            int GrPerDim = (int)Math.Ceiling((float)N / groupSize);

            KernelConfig dimension = (
                new Index2(GrPerDim, GrPerDim),    // Number of groups
                new Index2(groupSize, groupSize)); // Group size (thread count in group)

            sw.Restart();

            matrixMulKernelShared(dimension, d_a.View, d_b.View, d_c.View, N);

            // Wait for the kernel to finish before stopping the timer
            gpu.Synchronize();

            sw.Stop();

            // Copy the result to the host while the buffer is still alive
            return d_c.GetAsArray();
        }
    }
}