Example #1
0
        /// <summary>
        /// Runs the <c>MatrixMulShared</c> kernel on a CUDA accelerator for two
        /// N x N matrices and returns the contents of the output device buffer.
        /// </summary>
        /// <param name="a">Left operand; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="b">Right operand; flattened with <c>FlatternArr</c> before upload.</param>
        /// <param name="N">Matrix dimension; each device buffer holds <c>N * N</c> floats.</param>
        /// <param name="sw">
        /// Stopwatch restarted immediately before the kernel launch and stopped after
        /// the accelerator has synchronized. Allocation, host-to-device copies, and the
        /// final device-to-host copy are outside the measured interval.
        /// </param>
        /// <returns>The output buffer copied back to the host as a flat array of <c>N * N</c> floats.</returns>
        public static float[] RunMatrixMulShared(float[][] a, float[][] b, int N, ref Stopwatch sw)
        {
            // Context, accelerator and device buffers all own native resources.
            // The using statements release them in reverse order of creation
            // (buffers, then accelerator, then context), even if a call throws.
            using (var context = new Context())
            using (var gpu = new CudaAccelerator(context))
            {
                // Create typed launcher
                var matrixMulKernelShared = gpu.LoadStreamKernel<
                    ArrayView<float>,
                    ArrayView<float>,
                    ArrayView<float>,
                    int>(MatrixMulShared);

                // Allocate device memory for both inputs and the result
                var buffSize = N * N;
                using (MemoryBuffer<float> d_a = gpu.Allocate<float>(buffSize))
                using (MemoryBuffer<float> d_b = gpu.Allocate<float>(buffSize))
                using (MemoryBuffer<float> d_c = gpu.Allocate<float>(buffSize))
                {
                    // Upload the flattened inputs
                    d_a.CopyFrom(FlatternArr(a), 0, Index1.Zero, buffSize);
                    d_b.CopyFrom(FlatternArr(b), 0, Index1.Zero, buffSize);

                    // Groups per grid dimension: ceil(N / groupSize) using integer
                    // arithmetic, so the last partial tile still gets a group.
                    int groupsPerDim = (N + groupSize - 1) / groupSize;

                    KernelConfig dimension = (
                        new Index2(groupsPerDim, groupsPerDim),     // Number of groups
                        new Index2(groupSize, groupSize));          // Group size (thread count in group)

                    sw.Restart();

                    matrixMulKernelShared(dimension, d_a.View, d_b.View, d_c.View, N);

                    // Wait for the kernel to finish so the stopwatch covers its execution
                    gpu.Synchronize();

                    sw.Stop();

                    // Copy the result to the host before the device buffer is disposed
                    return d_c.GetAsArray();
                }
            }
        }