/// <summary>
/// Tiled matrix-multiply kernel. Each thread accumulates one element of the
/// result by walking BlockSize-wide tiles of A and B that the thread block
/// stages through shared arrays, then hands the final value to <paramref name="setC"/>.
/// </summary>
/// <param name="colsA">Extent of the shared dimension; determines how many tiles are walked.</param>
/// <param name="getA">Reads element (row, column) of the left operand.</param>
/// <param name="getB">Reads element (row, column) of the right operand.</param>
/// <param name="setC">Stores the computed value at (row, column) of the result.</param>
/// <param name="zero">Initial value of the accumulator.</param>
/// <param name="add">Combines the accumulator with a product.</param>
/// <param name="mul">Multiplies an element of A with an element of B.</param>
/// <remarks>
/// NOTE(review): no bounds checks are performed here; <paramref name="getA"/>,
/// <paramref name="getB"/> and <paramref name="setC"/> are called with indices
/// that can exceed the matrix extents when those are not multiples of BlockSize,
/// so the delegates presumably handle out-of-range indices - confirm against the caller.
/// </remarks>
private static void Kernel(long colsA, Func<long, long, T> getA, Func<long, long, T> getB, Action<long, long, T> setC, T zero, Func<T, T, T> add, Func<T, T, T> mul)
{
    var blockRow = blockIdx.x;
    var blockCol = blockIdx.y;
    var row = threadIdx.x;
    var col = threadIdx.y;

    // Coordinates of the single result element this thread produces.
    var globalRow = blockRow * BlockSize + row;
    var globalCol = blockCol * BlockSize + col;

    // Loop-invariant: evaluate the tile count once instead of on every
    // evaluation of the loop condition.
    var tileCount = ScalarOps.DivUp(colsA, BlockSize);

    var valueC = zero;

    for (var m = 0; m < tileCount; ++m)
    {
        var subA = __shared__.Array2D<T>(BlockSize, BlockSize);
        var subB = __shared__.Array2D<T>(BlockSize, BlockSize);

        // Each thread stages one element of the current A tile and one of the B tile.
        subA[row, col] = getA(globalRow, m * BlockSize + col);
        subB[row, col] = getB(m * BlockSize + row, globalCol);

        // Barrier: the inner loop below reads tile elements stored by other threads.
        DeviceFunction.SyncThreads();

        for (var e = 0; e < BlockSize; ++e)
        {
            valueC = add(valueC, mul(subA[row, e], subB[e, col]));
        }

        // Barrier: nobody may overwrite the tiles for the next iteration
        // while another thread is still reading the current ones.
        DeviceFunction.SyncThreads();
    }

    setC(globalRow, globalCol, valueC);
}