Ejemplo n.º 1
0
        /// <summary>
        /// Device kernel for a block-tiled matrix product. Each thread accumulates a single
        /// element of C by stepping through BlockSize x BlockSize tiles of A and B that are
        /// staged in <c>__shared__</c> arrays.
        /// </summary>
        /// <param name="colsA">
        /// Value divided (rounded up via <c>ScalarOps.DivUp</c>) by BlockSize to obtain the number
        /// of tile iterations; by its name, the column count of A.
        /// </param>
        /// <param name="getA">Callback that returns the element of A at (row, column).</param>
        /// <param name="getB">Callback that returns the element of B at (row, column).</param>
        /// <param name="setC">Callback that stores the computed element of C at (row, column).</param>
        /// <param name="zero">Starting value of the accumulator.</param>
        /// <param name="add">Folds one product term into the accumulator.</param>
        /// <param name="mul">Multiplies an element of A with an element of B.</param>
        /// <remarks>
        /// NOTE(review): this method performs no bounds checks of its own. If the matrix extents
        /// are not multiples of BlockSize, the callbacks presumably have to cope with indices
        /// past the edge (e.g. by returning <paramref name="zero"/> / ignoring the store) -
        /// confirm against the callers.
        /// </remarks>
        private static void Kernel(long colsA, Func <long, long, T> getA, Func <long, long, T> getB, Action <long, long, T> setC, T zero, Func <T, T, T> add, Func <T, T, T> mul)
        {
            // Thread position inside its block; also the position inside each tile.
            var localRow = threadIdx.x;
            var localCol = threadIdx.y;

            // Coordinates of the C element this thread is responsible for.
            var outRow = blockIdx.x * BlockSize + localRow;
            var outCol = blockIdx.y * BlockSize + localCol;

            // Number of tiles along the dimension shared by A and B.
            var tileCount = ScalarOps.DivUp(colsA, BlockSize);

            var acc = zero;

            for (var tile = 0; tile < tileCount; ++tile)
            {
                var tileA = __shared__.Array2D<T>(BlockSize, BlockSize);
                var tileB = __shared__.Array2D<T>(BlockSize, BlockSize);

                // Each thread loads exactly one element of each tile.
                var tileOffset = tile * BlockSize;
                tileA[localRow, localCol] = getA(outRow, tileOffset + localCol);
                tileB[localRow, localCol] = getB(tileOffset + localRow, outCol);

                // Synchronize so that (presumably) all threads of the block have stored their
                // elements before any thread starts reading the tiles.
                DeviceFunction.SyncThreads();

                // Partial dot product of this tile's row of A with its column of B.
                for (var k = 0; k < BlockSize; ++k)
                {
                    var product = mul(tileA[localRow, k], tileB[k, localCol]);
                    acc = add(acc, product);
                }

                // Synchronize again so the tiles are not overwritten by the next iteration
                // while other threads may still be reading them.
                DeviceFunction.SyncThreads();
            }

            setC(outRow, outCol, acc);
        }