/// <summary>
/// Tiled matrix multiplication kernel operating on linear views.
/// For the thread at global position (gx, gy) it computes
/// c[gy * N + gx] = sum over j of a[gy * N + j] * b[j * N + gx],
/// staging groupSize x groupSize tiles of a and b in shared memory.
/// </summary>
/// <param name="a">Left input, indexed as a[row * N + column].</param>
/// <param name="b">Right input, indexed as b[row * N + column].</param>
/// <param name="c">Output, indexed as c[row * N + column].</param>
/// <param name="N">Edge length used for all linear index computations.</param>
/// <remarks>
/// NOTE(review): assumes the kernel is launched with 2D groups of
/// groupSize x groupSize threads - confirm against the launch site.
/// </remarks>
private static void MatrixMulShared(ArrayView<float> a, ArrayView<float> b, ArrayView<float> c, int N)
{
    int gx = Grid.GlobalIndex.X;
    int gy = Grid.GlobalIndex.Y;
    int lx = Group.IdxX;
    int ly = Group.IdxY;

    var sa = SharedMemory.Allocate2D<float>(groupSize, groupSize);
    var sb = SharedMemory.Allocate2D<float>(groupSize, groupSize);

    float sum = 0;

    // No thread may leave before the loop: every thread of the group has to
    // reach each Group.Barrier() below, so out-of-range threads keep
    // participating and simply contribute zeros to the tiles.
    for (int k = 0; k < N; k += groupSize)
    {
        int aColumn = lx + k;
        int bRow = ly + k;

        if (gy < N && aColumn < N)
        {
            sa[lx, ly] = a[gy * N + aColumn];
        }
        else
        {
            sa[lx, ly] = 0;
        }

        if (bRow < N && gx < N)
        {
            sb[lx, ly] = b[bRow * N + gx];
        }
        else
        {
            sb[lx, ly] = 0;
        }

        // Both tiles must be completely written before any thread reads them.
        Group.Barrier();

        for (int r = 0; r < groupSize; r++)
        {
            sum += sa[r, ly] * sb[lx, r];
        }

        // Keep the next iteration from overwriting tiles that are still being read.
        Group.Barrier();
    }

    // Only threads that map to a valid output element store their result.
    int outIndex = gy * N + gx;
    if (gx < N && gy < N && outIndex < c.Length)
    {
        c[outIndex] = sum;
    }
}
/// <summary>
/// The tiled matrix multiplication kernel that runs on the accelerated device.
/// Each thread accumulates one element of <paramref name="cView"/> by walking
/// the shared dimension in steps of TILE_SIZE, staging one tile of each input
/// in shared memory per step.
/// </summary>
/// <param name="aView">An input matrix of size MxK (X extent = M, Y extent = K)</param>
/// <param name="bView">An input matrix of size KxN (X extent = K, Y extent = N)</param>
/// <param name="cView">An output matrix of size MxN</param>
static void MatrixMultiplyTiledKernel(
    ArrayView2D<float, Stride2D.DenseX> aView,
    ArrayView2D<float, Stride2D.DenseX> bView,
    ArrayView2D<float, Stride2D.DenseX> cView)
{
    var global = Grid.GlobalIndex.XY;
    var x = Group.IdxX;
    var y = Group.IdxY;

    var aTile = SharedMemory.Allocate2D<float, Stride2D.DenseX>(
        new Index2D(TILE_SIZE, TILE_SIZE),
        new Stride2D.DenseX(TILE_SIZE));
    var bTile = SharedMemory.Allocate2D<float, Stride2D.DenseX>(
        new Index2D(TILE_SIZE, TILE_SIZE),
        new Stride2D.DenseX(TILE_SIZE));
    var sum = 0.0f;

    // The loop variable walks the shared dimension K: it offsets the Y index
    // of aView and the X index of bView. The bound therefore has to be the
    // Y extent of aView, not its X extent (M); otherwise the sum is cut short
    // whenever M < K.
    var sharedExtent = aView.IntExtent.Y;

    for (var i = 0; i < sharedExtent; i += TILE_SIZE)
    {
        // Out-of-range positions are padded with zero so that every thread
        // still reaches both barriers and partial tiles add nothing to the sum.
        if (global.X < aView.IntExtent.X && y + i < aView.IntExtent.Y)
        {
            aTile[x, y] = aView[global.X, y + i];
        }
        else
        {
            aTile[x, y] = 0;
        }

        if (x + i < bView.IntExtent.X && global.Y < bView.IntExtent.Y)
        {
            bTile[x, y] = bView[x + i, global.Y];
        }
        else
        {
            bTile[x, y] = 0;
        }

        // Tiles must be fully populated before they are consumed.
        Group.Barrier();

        for (var k = 0; k < TILE_SIZE; k++)
        {
            sum += aTile[new Index2D(x, k)] * bTile[new Index2D(k, y)];
        }

        // Prevent the next iteration from overwriting tiles still in use.
        Group.Barrier();
    }

    if (global.X < cView.IntExtent.X && global.Y < cView.IntExtent.Y)
    {
        cView[global] = sum;
    }
}
/// <summary>
/// The tiled matrix multiplication kernel that runs on the accelerated device.
/// Each thread accumulates one element of <paramref name="cView"/> by walking
/// the shared dimension in steps of TILE_SIZE, staging one tile of each input
/// in shared memory per step.
/// </summary>
/// <param name="index">Current matrix index</param>
/// <param name="aView">An input matrix of size MxK (Width = M, Height = K)</param>
/// <param name="bView">An input matrix of size KxN (Width = K, Height = N)</param>
/// <param name="cView">An output matrix of size MxN</param>
static void MatrixMultiplyTiledKernel(
    GroupedIndex2 index,
    ArrayView2D<float> aView,
    ArrayView2D<float> bView,
    ArrayView2D<float> cView)
{
    var global = index.ComputeGlobalIndex();
    var x = index.GroupIdx.X;
    var y = index.GroupIdx.Y;

    var aTile = SharedMemory.Allocate2D<float>(TILE_SIZE, TILE_SIZE);
    var bTile = SharedMemory.Allocate2D<float>(TILE_SIZE, TILE_SIZE);
    var sum = 0.0f;

    // The loop variable walks the shared dimension K: it offsets the second
    // index of aView (bounded by Height) and the first index of bView
    // (bounded by Width). The bound therefore has to be aView.Height, not
    // aView.Width (M); otherwise the sum is cut short whenever M < K.
    var sharedExtent = aView.Height;

    for (var i = 0; i < sharedExtent; i += TILE_SIZE)
    {
        // Out-of-range positions are padded with zero so that every thread
        // still reaches both barriers and partial tiles add nothing to the sum.
        if (global.X < aView.Width && y + i < aView.Height)
        {
            aTile[x, y] = aView[global.X, y + i];
        }
        else
        {
            aTile[x, y] = 0;
        }

        if (x + i < bView.Width && global.Y < bView.Height)
        {
            bTile[x, y] = bView[x + i, global.Y];
        }
        else
        {
            bTile[x, y] = 0;
        }

        // Tiles must be fully populated before they are consumed.
        Group.Barrier();

        for (var k = 0; k < TILE_SIZE; k++)
        {
            sum += aTile[new Index2(x, k)] * bTile[new Index2(k, y)];
        }

        // Prevent the next iteration from overwriting tiles still in use.
        Group.Barrier();
    }

    if (global.X < cView.Width && global.Y < cView.Height)
    {
        cView[global] = sum;
    }
}