Exemplo n.º 1
0
        //这里先对二维数组进行降维(PACK方法)处理,变成一维数组,效率会提高很多,这里直接计算出A和B的列数输入进去
        private static void KernelPacked(double[] a, double[] b, double[] c, int colsA, int colsB, int colsC)
        {
            //获取当前内核的块的位置
            var blockRow = blockIdx.x;
            var blockCol = blockIdx.y;

            var valueC = 0.0;

            //获取当前块的线程的位置
            var row = threadIdx.x;
            var col = threadIdx.y;

            for (var m = 0; m < DivUp(colsA, BlockSize); ++m)
            {
                var subA = __shared__.Array2D <double>(BlockSize, BlockSize);
                var subB = __shared__.Array2D <double>(BlockSize, BlockSize);

                subA[row, col] = GetMatrixElement(colsA, a, blockRow, m, row, col);
                subB[row, col] = GetMatrixElement(colsB, b, m, blockCol, row, col);
                //同步线程,等所有线程都拿到数据再开始计算
                DeviceFunction.SyncThreads();

                //计算对应的行和列的乘积然后求和
                for (var e = 0; e < BlockSize; ++e)
                {
                    valueC += subA[row, e] * subB[e, col];
                    //valueC = col;
                }
                //同步线程,得出结果
                DeviceFunction.SyncThreads();
            }

            SetMatrixElement(colsC, c, blockRow, blockCol, row, col, valueC);
        }
Exemplo n.º 2
0
        private static void Kernel(long colsA, Func <long, long, T> getA, Func <long, long, T> getB, Action <long, long, T> setC, T zero, Func <T, T, T> add, Func <T, T, T> mul)
        {
            var blockRow = blockIdx.x;
            var blockCol = blockIdx.y;

            var valueC = zero;

            var row = threadIdx.x;
            var col = threadIdx.y;

            for (var m = 0; m < ScalarOps.DivUp(colsA, BlockSize); ++m)
            {
                var subA = __shared__.Array2D <T>(BlockSize, BlockSize);
                var subB = __shared__.Array2D <T>(BlockSize, BlockSize);

                subA[row, col] = getA(blockRow * BlockSize + row, m * BlockSize + col);
                subB[row, col] = getB(m * BlockSize + row, blockCol * BlockSize + col);
                DeviceFunction.SyncThreads();

                for (var e = 0; e < BlockSize; ++e)
                {
                    valueC = add(valueC, mul(subA[row, e], subB[e, col]));
                }
                DeviceFunction.SyncThreads();
            }

            setC(blockRow * BlockSize + row, blockCol * BlockSize + col, valueC);
        }
Exemplo n.º 3
0
        private static void backward_bias_kernel(float[] biasUpdates, float[] delta, int batch, int n, int size)
        {
            var   part = __shared__.Array <float>(CudaUtils.BlockSize);
            int   i, b;
            int   filter = blockIdx.x;
            int   p      = threadIdx.x;
            float sum    = 0;

            for (b = 0; b < batch; ++b)
            {
                for (i = 0; i < size; i += CudaUtils.BlockSize)
                {
                    int index = p + i + size * (filter + n * b);
                    sum += (p + i < size) ? delta[index] : 0;
                }
            }
            part[p] = sum;
            DeviceFunction.SyncThreads();
            if (p == 0)
            {
                for (i = 0; i < CudaUtils.BlockSize; ++i)
                {
                    biasUpdates[filter] += part[i];
                }
            }
        }
Exemplo n.º 4
0
        public static void Kernel(double[] positionsx, double[] positionsy, double[] velocitiesx, double[] velocitiesy, double[] accelerationsx,
                                  double[] accelerationsy, int width, int height, float mousex, float mousey, int[,] squareFish, int[] squaresStart,
                                  int[] fishInSquare, int squaresInRow, int squaresNumber, int[] bitmap)
        {
            int ind = blockIdx.x * blockDim.x + threadIdx.x;

            FishFunctions.AlignWithOtherFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, ind,
                                             squareFish, squaresStart, fishInSquare, squaresInRow, squaresNumber);
            FishFunctions.CohesionWithOtherFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, ind,
                                                squaresStart, squareFish, fishInSquare, squaresNumber, squaresInRow);
            FishFunctions.AvoidOtherFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, ind,
                                         squaresStart, squareFish, fishInSquare, squaresInRow, squaresNumber);
            if (mousex >= 0 && mousey >= 0)
            {
                FishFunctions.AvoidMouse(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, ind,
                                         mousex, mousey);
            }
            FishFunctions.UpdateFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, ind,
                                     width, height, mousex, mousey);
            FishFunctions.Edges(positionsx, positionsy, ind, width, height);
            DeviceFunction.SyncThreads();
            int col = (255 << 24) + (0 << 16) + (255 << 8) + 255;
            int x   = (int)positionsx[ind];
            int y   = (int)positionsy[ind];

            CircleDrawing.CircleBresenham(x, y, 2, bitmap, width, height, col);
        }
Exemplo n.º 5
0
        public static void Kernel(double[] result, int n, double lb, double ub, double d)
        {
            var temp  = __shared__.Array <double>(BlockSize);
            var start = blockIdx.x * blockDim.x + threadIdx.x;
            var step  = gridDim.x * blockDim.x;

            for (int yi = start; yi < n; yi += step)
            {
                double y      = (yi + 0.5) * d;
                double intSum = 0;
                for (int xi = 0; xi < n; xi++)
                {
                    double x = (xi + 0.5) * d;
                    intSum += CalkaGPU(x, y);
                }
                temp[threadIdx.x] += intSum * d * d;
            }

            DeviceFunction.SyncThreads();

            if (threadIdx.x == 0)
            {
                for (int i = 0; i < BlockSize; i++)
                {
                    result[blockIdx.x] += temp[i];
                }
            }
        }
Exemplo n.º 6
0
        // ReSharper disable once SuggestBaseTypeForParameter
        private static void KernelSequentialReduceIdleThreads <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
        {
            var shared = __shared__.ExternArray <T>();

            var tid = threadIdx.x;
            var bid = blockIdx.x;
            var gid = 2 * blockDim.x * bid + tid;

            shared[tid] = (gid < length && gid + blockDim.x < length)
                ? op(array[gid], array[gid + blockDim.x])
                : array[gid];

            DeviceFunction.SyncThreads();

            for (int s = blockDim.x / 2; s > 0; s >>= 1)
            {
                if (tid < s && gid + s < length)
                {
                    shared[tid] = op(shared[tid], shared[tid + s]);
                }

                DeviceFunction.SyncThreads();
            }

            if (tid == 0)
            {
                result[bid] = shared[0];
            }
        }
Exemplo n.º 7
0
        // ReSharper disable once SuggestBaseTypeForParameter
        private static void KernelInterleavedAccess <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
        {
            var shared = __shared__.ExternArray <T>();

            var tid = threadIdx.x;
            var bid = blockIdx.x;
            var gid = blockDim.x * bid + tid;

            if (gid < length)
            {
                shared[tid] = array[gid];
            }

            DeviceFunction.SyncThreads();

            for (var s = 1; s < blockDim.x; s *= 2)
            {
                if (tid % (2 * s) == 0 && gid + s < length)
                {
                    shared[tid] = op(shared[tid], shared[tid + s]);
                }

                DeviceFunction.SyncThreads();
            }

            if (tid == 0)
            {
                result[bid] = shared[0];
            }
        }
Exemplo n.º 8
0
        private static void AleaKernelConstants(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            Constant <int> c,
            int n,
            int pitch)
        {
            // Same as CudaKernelOptimised2, but the number of coordinates is given as a meta-constant.
            // Also, we write the results as float2.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c.Value * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c.Value; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            var line = threadIdx.x / (blockDim.x / 2);
            var tid  = threadIdx.x % (blockDim.x / 2);

            if (bJ + tid * 2 < n)
            {
                var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();

                for (int i = line; i < blockDim.x && bI + i < n; i += 2)
                {
                    var dist = default(Real2);

                    for (int k = 0; k != c.Value; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];
                        var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                        var diff   = new Real2(coord1 - coord2.x, coord1 - coord2.y);

                        dist.x += diff.x * diff.x;
                        dist.y += diff.y * diff.y;
                    }

                    var dst = mSquaredDistances.Ptr((bI + i) * pitch + bJ).Reinterpret <Real2>();
                    dst[tid] = dist;
                }
            }
        }
Exemplo n.º 9
0
        private static void AleaKernelFloat2(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            int c,
            int n,
            int pitch)
        {
            // Same as KernelSharedMemory, but one thread does two element in one by using float2 reads.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            var line = threadIdx.x / (blockDim.x / 2);
            var tid  = threadIdx.x % (blockDim.x / 2);

            if (bJ + tid * 2 < n)
            {
                var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();

                for (int i = line; i < blockDim.x && bI + i < n; i += 2)
                {
                    Real dist0 = 0;
                    Real dist1 = 0;

                    for (int k = 0; k != c; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];
                        var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                        var diff   = new Real2(coord1 - coord2.x, coord1 - coord2.y);

                        dist0 += diff.x * diff.x;
                        dist1 += diff.y * diff.y;
                    }

                    mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 0)] = dist0;
                    mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 1)] = dist1;
                }
            }
        }
Exemplo n.º 10
0
        private static void AleaKernelLocalMemory(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            Constant <int> dimX,
            Constant <int> c,
            int n,
            int pitch)
        {
            // Same as KernelConstants, but use both local and shared memory to increase the effective shared memory.

            var coordinatesI = __shared__.Array <Real>(c.Value * dimX.Value);
            var coordinatesJ = __local__.Array <Real2>(c.Value);

            var bI       = blockIdx.y * dimX.Value;
            var bJ       = blockIdx.x * dimX.Value;
            var line     = threadIdx.x / (dimX.Value / 2);
            var tid      = threadIdx.x % (dimX.Value / 2);
            var isActive = bJ + tid * 2 < n;

            for (int k = 0; k != c.Value; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * dimX.Value + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (isActive)
                {
                    var mCoordinates2 = mCoordinates.Reinterpret <Real2>();
                    coordinatesJ[k] = mCoordinates2[(k * n + bJ) / 2 + tid];
                }
            }

            DeviceFunction.SyncThreads();

            if (isActive)
            {
                for (int i = line; i < dimX.Value && bI + i < n; i += 2)
                {
                    var dist = default(Real2);

                    for (int k = 0; k != c.Value; ++k)
                    {
                        var coord1 = coordinatesI[k * dimX.Value + i];
                        var coord2 = coordinatesJ[k];
                        var diff   = new Real2(coord1 - coord2.x, coord1 - coord2.y);

                        dist.x += diff.x * diff.x;
                        dist.y += diff.y * diff.y;
                    }

                    var dst = mSquaredDistances.Reinterpret <Real2>();
                    dst[((bI + i) * pitch + bJ) / 2 + tid] = dist;
                }
            }
        }
Exemplo n.º 11
0
        // ReSharper disable once SuggestBaseTypeForParameter
        private static void KernelSequentialReduceIdleThreadsWarp <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
        {
            var shared = __shared__.ExternArray <T>();

            var tid = threadIdx.x;
            var bid = blockIdx.x;
            var bdm = blockDim.x;
            var gid = 2 * bdm * bid + tid;

            shared[tid] = (gid < length && gid + bdm < length)
                ? op(array[gid], array[gid + bdm])
                : array[gid];

            DeviceFunction.SyncThreads();

            for (var s = bdm / 2; s > WarpSize; s >>= 1)
            {
                if (tid < s && gid + s < length)
                {
                    shared[tid] = op(shared[tid], shared[tid + s]);
                }

                DeviceFunction.SyncThreads();
            }

            if (tid < WarpSize)
            {
                if (bdm >= 2 * WarpSize)
                {
                    shared[tid] = op(shared[tid], shared[tid + WarpSize]);
                }

                shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 16));
                shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 8));
                shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 4));
                shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 2));
                shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 1));
            }

            if (tid == 0)
            {
                result[bid] = shared[0];
            }
        }
Exemplo n.º 12
0
        // ReSharper disable once SuggestBaseTypeForParameter
        private static void KernelSequentialReduceIdleThreadsWarpMultiple <T>(deviceptr <T> array, int length, deviceptr <T> result, Func <T, T, T> op)
        {
            var tid = threadIdx.x;
            var bid = blockIdx.x;
            var bdm = blockDim.x;
            var gid = bdm * bid + tid;

            // Todo: 'default(T)' is a bad idea, think of (n * 0) => The accumulator's initial value should be provided by the user!
            var accumulator = default(T);

            while (gid < length)
            {
                accumulator = op(accumulator, array[gid]);
                gid        += gridDim.x * bdm;
            }

            accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 16));
            accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 8));
            accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 4));
            accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 2));
            accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 1));

            var shared = __shared__.Array <T>(8);

            if (tid % WarpSize == 0)
            {
                shared[tid / WarpSize] = accumulator;
            }

            DeviceFunction.SyncThreads();

            if (tid == 0)
            {
                var a = op(op(shared[0], shared[1]), op(shared[2], shared[3]));
                var b = op(op(shared[4], shared[5]), op(shared[6], shared[7]));
                result[bid] = op(a, b);
            }
        }
Exemplo n.º 13
0
Arquivo: SortGpu.cs Projeto: r3db/Sort
        internal static int[] Compute1(int[] array) /*where T : IComparable<T>*/
        {
            var steps = array.Length % 2 == 0
                ? array.Length / 2
                : array.Length / 2 + 1;

            var gpu = Gpu.Default;

            var inputLength = array.Length;
            var inputMemory = gpu.Allocate(array);

            gpu.For(0, array.Length, i =>
            {
                for (var k = 0; k < steps; k++)
                {
                    if (i < inputLength - 1)
                    {
                        var c = inputMemory[i + 0];
                        var n = inputMemory[i + 1];

                        if (i % 2 == 0 && c > n)
                        {
                            Exchange(inputMemory, i, i + 1);
                        }

                        if (i % 2 != 0 && c > n)
                        {
                            Exchange(inputMemory, i, i + 1);
                        }

                        DeviceFunction.SyncThreads();
                    }
                }
            });

            return(Gpu.CopyToHost(inputMemory));
        }
Exemplo n.º 14
0
        //核函数:输入矩阵a和b,返回矩阵c。将输入和输出放在一起的写法
        private static void Kernel(double[,] a, double[,] b, double[,] c)
        {
            var colsA    = a.GetLength(1); //colsA为矩阵A的列数组
            var blockRow = blockIdx.x;     //二维的block行数
            var blockCol = blockIdx.y;     //二维的block列数

            var valueC = 0.0;

            var row = threadIdx.x; //二维的线程行数
            var col = threadIdx.y; //二维的线程列数

            //这里DivUP是向上取整,相当于ceil操作。例如我们有矩阵A有33列,线程数为32,
            //那么我们需要多分配一个block用来计算,因此向上取整
            for (var m = 0; m < DivUp(colsA, BlockSize); ++m)
            {
                //构建两个共享内存中的二维数组
                var subA = __shared__.Array2D <double>(BlockSize, BlockSize);
                var subB = __shared__.Array2D <double>(BlockSize, BlockSize);

                //填充两二维数组
                subA[row, col] = GetMatrixElement(a, blockRow, m, row, col);
                subB[row, col] = GetMatrixElement(b, m, blockCol, row, col);
                //同步线程,等所有线程都拿到数据再开始计算
                DeviceFunction.SyncThreads();

                for (var e = 0; e < BlockSize; ++e)
                {
                    //计算每个线程的值
                    valueC += subA[row, e] * subB[e, col];
                    //valueC = row;
                }
                //同步线程,得出结果
                DeviceFunction.SyncThreads();
            }
            //把计算出来的值赋值给各行各列
            SetMatrixElement(c, blockRow, blockCol, row, col, valueC);
        }
Exemplo n.º 15
0
        private static void AleaKernelSharedMemory(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            int c,
            int n,
            int pitch)
        {
            // We've got shared memory of two vector of K dimensions for B points:
            //
            //      var coordI = __shared__ new Real[k*blockDim.x];
            //      var coordJ = __shared__ new Real[k*blockDim.x];
            //
            // We fill in these two vectors with the coordinates of the I points and J points.
            // Afterwards, the current block will compute the euclidean distances between all
            // the I points and J points, producing a square matrix [B, B].
            //
            // This optimisation means that when producing the square matrix, the I and J points
            // coordinates are only read once.
            //
            // This optimisation works well if K is small enough. Otherwise the shared memory is
            // too small and not enough blocks get schedule per SM.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            if (bJ + threadIdx.x < n)
            {
                for (int i = 0; i < blockDim.x && bI + i < n; ++i)
                {
                    Real dist = 0;

                    for (int k = 0; k != c; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];           //mCoordinates[k * x + i];
                        var coord2 = coordinatesJ[k * blockDim.x + threadIdx.x]; //mCoordinates[k * x + j];
                        var diff   = coord1 - coord2;

                        dist += diff * diff;
                    }

                    mSquaredDistances[(bI + i) * pitch + (bJ + threadIdx.x)] = dist;
                }
            }
        }
        public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] statusOfSynchronization)
        {
            // the status might be YES, NO and DUNNO (aleaGPU enum???)
            // TODO: change this Kernel and computation!
            var n          = problemSize.Value;
            var arrayCount = precomputedStateTransitioningMatrixA.Length / n;
            var power      = 1 << n;

            #region Pointer setup
            var byteOffset = 0;

            var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);

            var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);
            #endregion

            var acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var acBegin = blockIdx.x * acPart;
            var acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            var index = acBegin * n;
            for (int ac = acBegin; ac < acEnd; ac++, index += n)
            {
                DeviceFunction.SyncThreads();
                if (threadIdx.x == 0)
                {
                    for (int i = 0; i < n; i++)
                    {
                        gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]);
                        gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]);
                    }
                }
                var pathMask = threadIdx.x;
                int vertexAfterTransition;
                var consideringVertex = power - 1;
                DeviceFunction.SyncThreads();
                for (int iter = 0; iter < 9; iter++, pathMask >>= 1)
                {
                    vertexAfterTransition = 0;
                    if ((pathMask & 1) == 0)
                    {
                        for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                        {
                            if (0 != (mask & consideringVertex))
                            {
                                vertexAfterTransition |= gpuA[i];
                            }
                        }
                    }
                    else
                    {
                        for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                        {
                            if (0 != (mask & consideringVertex))
                            {
                                vertexAfterTransition |= gpuB[i];
                            }
                        }
                    }
                    consideringVertex = vertexAfterTransition;
                }
                var singleVertex = DeviceFunction.Any(0 == (consideringVertex & (consideringVertex - 1)));
                if (singleVertex && threadIdx.x % DeviceFunction.WarpSize == 0)
                {
                    statusOfSynchronization[ac] = true;
                }
            }
        }
        public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] isSynchronizing,
            int[] shortestSynchronizingWordLength)
        {
            var       n            = problemSize.Value;
            var       arrayCount   = precomputedStateTransitioningMatrixA.Length / n;
            var       power        = 1 << n;
            const int bitSize      = 6;
            int       twoToBitsize = (1 << bitSize) - 1;
            var       wordCount    = (8 * sizeof(int) / bitSize);

            #region Pointer setup
            var byteOffset = 0;

            var queueEvenCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                 .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);

            var readingQueueIndex = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                    .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);

            var queueOddCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);


            var gpuAB = DeviceFunction.AddressOfArray(__shared__.ExternArray <uint>())
                        .Ptr(byteOffset / sizeof(uint))
                        .Volatile();
            byteOffset += (n + 1) * sizeof(uint);

            // must be the last among ints
            var isDiscoveredPtr = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                  .Ptr(byteOffset / sizeof(int));
            var complexOffset = (power * sizeof(int) + wordCount - 1) / wordCount;
            byteOffset += complexOffset + (((complexOffset % sizeof(int)) & 1) == 1 ? 1 : 0);

            var queueEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                            .Ptr(byteOffset / sizeof(ushort))
                            .Volatile();
            byteOffset += (power / 2 + 1) * sizeof(ushort);

            var queueOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                           .Ptr(byteOffset / sizeof(ushort))
                           .Volatile();
            byteOffset += (power / 2 + 1) * sizeof(ushort);

            var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                             .Ptr(byteOffset / sizeof(bool))
                             .Volatile();
            byteOffset += sizeof(bool);
            #endregion

            ushort nextDistance;
            ushort vertexAfterTransitionA,
                   vertexAfterTransitionB;
            uint vertexAfterTransition;
            var  acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var  acBegin = blockIdx.x * acPart;
            var  acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            var index = acBegin * n;
            var maskN = (1 << n) - 1;

            for (int ac = acBegin; ac < acEnd; ac++, index += n)
            {
                // cleanup
                for (int consideringVertex = threadIdx.x, endingVertex = (power - 1) / wordCount;
                     consideringVertex < endingVertex;
                     consideringVertex += blockDim.x)
                {
                    isDiscoveredPtr[consideringVertex] = 0;
                }

                if (threadIdx.x < n)
                {
                    gpuAB[threadIdx.x + 1] = (uint)(
                        (1 << (n + precomputedStateTransitioningMatrixA[index + threadIdx.x]))
                        | (1 << precomputedStateTransitioningMatrixB[index + threadIdx.x])
                        );
                }
                else if (threadIdx.x == n)
                {
                    gpuAB[0]             = 0;
                    readingQueueIndex[0] = 0;
                    shouldStop[0]        = false;
                    queueEvenCount[0]    = 0;
                    queueOddCount[0]     = 1;
                    queueOdd[0]          = (ushort)(power - 1);
                    // assuming n >= 2
                    isDiscoveredPtr[(power - 1) / wordCount] = 1 << (((power - 1) % wordCount) * bitSize);
                }

                var readingQueue      = queueOdd;
                var writingQueue      = queueEven;
                var readingQueueCount = queueOddCount;
                var writingQueueIndex = queueEvenCount;

                nextDistance = 1;
                int readingQueueCountCached = 1;
                DeviceFunction.SyncThreads();

                while (readingQueueCountCached > 0 && !shouldStop[0])
                {
                    //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac);
                    while (readingQueueIndex[0] < readingQueueCountCached)
                    {
                        var iter = DeviceFunction.AtomicAdd(readingQueueIndex, 1);
                        if (iter >= readingQueueCountCached)
                        {
                            break;
                        }

                        int consideringVertex = readingQueue[iter];

                        vertexAfterTransition = 0;
                        for (int i = 1; i <= n; i++, consideringVertex >>= 1)
                        {
                            vertexAfterTransition |= gpuAB[i * (1 & consideringVertex)];
                        }

                        vertexAfterTransitionA = (ushort)(vertexAfterTransition >> n);
                        vertexAfterTransitionB = (ushort)(vertexAfterTransition & maskN);

                        var isDiscoveredOffset = (vertexAfterTransitionA % wordCount) * bitSize;

                        if (0 == (isDiscoveredPtr[vertexAfterTransitionA / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                        {
                            // should have used AtomicOr (which is not available in AleaGPU - very unfortunate)
                            var beforeAdded = DeviceFunction.AtomicAdd(
                                isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                                1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);

                            if (0 == beforeAdded)
                            {
                                if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                                {
                                    shortestSynchronizingWordLength[ac] = nextDistance;
                                    isSynchronizing[ac] = true;
                                    shouldStop[0]       = true;
                                    break;
                                }

                                writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)]
                                    = (ushort)vertexAfterTransitionA;
                            }
                            else
                            {
                                DeviceFunction.AtomicSub(
                                    isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                                    1 << isDiscoveredOffset);
                            }
                        }

                        isDiscoveredOffset = (vertexAfterTransitionB % wordCount) * bitSize;
                        if (0 == (isDiscoveredPtr[vertexAfterTransitionB / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                        {
                            var beforeAdded = DeviceFunction.AtomicAdd(
                                isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                                1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);

                            if (0 == beforeAdded)
                            {
                                if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                                {
                                    shortestSynchronizingWordLength[ac] = nextDistance;
                                    isSynchronizing[ac] = true;
                                    shouldStop[0]       = true;
                                    break;
                                }

                                writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)]
                                    = (ushort)vertexAfterTransitionB;
                            }
                            else
                            {
                                DeviceFunction.AtomicSub(
                                    isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                                    1 << isDiscoveredOffset);
                            }
                        }
                    }
                    DeviceFunction.SyncThreads();
                    ++nextDistance;
                    readingQueue            = nextDistance % 2 == 0 ? queueEven : queueOdd;
                    writingQueue            = nextDistance % 2 != 0 ? queueEven : queueOdd;
                    readingQueueCount       = nextDistance % 2 == 0 ? queueEvenCount : queueOddCount;
                    writingQueueIndex       = nextDistance % 2 != 0 ? queueEvenCount : queueOddCount;
                    readingQueueCountCached = nextDistance % 2 == 0 ? queueEvenCount[0] : queueOddCount[0];
                    if (threadIdx.x == 0)
                    {
                        writingQueueIndex[0] = 0;
                        readingQueueIndex[0] = 0;
                    }
                    DeviceFunction.SyncThreads();
                }
            }
        }
Exemplo n.º 18
0
        public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] isSynchronizing,
            int[] shortestSynchronizingWordLength)
        {
            var n          = problemSize.Value;
            var arrayCount = precomputedStateTransitioningMatrixA.Length / n;
            var power      = 1 << n;

            #region Pointer setup
            var byteOffset = 0;

            var minEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                          .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var minOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                         .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var maxEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                          .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var maxOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                         .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);

            var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);

            var isActiveEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                               .Ptr(byteOffset / sizeof(bool));
            byteOffset += power * sizeof(bool);

            var isActiveOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                              .Ptr(byteOffset / sizeof(bool));
            byteOffset += power * sizeof(bool);

            var isDiscovered = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                               .Ptr(byteOffset / sizeof(bool));
            byteOffset += power * sizeof(bool);

            var addedAnythingOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                                   .Ptr(byteOffset / sizeof(bool))
                                   .Volatile();
            byteOffset += sizeof(bool);

            var addedAnythingEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                                    .Ptr(byteOffset / sizeof(bool))
                                    .Volatile();
            byteOffset += sizeof(bool);

            var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                             .Ptr(byteOffset / sizeof(bool))
                             .Volatile();
            byteOffset += sizeof(bool);
            #endregion


            ushort nextDistance;
            int    vertexAfterTransitionA,
                   vertexAfterTransitionB,
                   index;
            var acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var acBegin = blockIdx.x * acPart;
            var acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            index = acBegin * n;
            var threadWork = (power + blockDim.x - 1) / blockDim.x;
            DeviceFunction.SyncThreads();
            for (int ac = acBegin; ac < acEnd; ac++, index += n)
            {
                //Console.WriteLine("Begin {0}", threadIdx.x);
                // cleanup
                var readingActive        = isActiveOdd;
                var writingActive        = isActiveEven;
                var readingAnythingAdded = addedAnythingOdd;
                var writingAnythingAdded = addedAnythingEven;
                var minRead  = minOdd;
                var minWrite = minEven;
                var maxRead  = maxOdd;
                var maxWrite = maxEven;

                if (threadIdx.x == DeviceFunction.WarpSize || (threadIdx.x == 0 && blockDim.x <= DeviceFunction.WarpSize))
                {
                    for (int i = 0; i < n; i++)
                    {
                        gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]);
                        gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]);
                    }
                }

                minRead[threadIdx.x]  = int.MaxValue;
                maxRead[threadIdx.x]  = 0;
                minWrite[threadIdx.x] = int.MaxValue;
                maxWrite[threadIdx.x] = 0;
                if (threadIdx.x == 0)
                {
                    shouldStop[0]           = false;
                    readingAnythingAdded[0] = true;
                }
                else if (threadIdx.x == blockDim.x - 1)
                {
                    minRead[blockDim.x - 1] = (power - 1) % threadWork;
                    maxRead[blockDim.x - 1] = power % threadWork;
                }
                nextDistance = 1;
                for (int consideringVertex = threadIdx.x, endingVertex = power, powerm1 = power - 1;
                     consideringVertex < endingVertex;
                     consideringVertex += blockDim.x)
                {
                    isActiveEven[consideringVertex] = false;
                    isActiveOdd[consideringVertex]  = consideringVertex == powerm1;
                    isDiscovered[consideringVertex] = consideringVertex == powerm1;
                    //Console.WriteLine("cleaning {0}, {1} {2} {3}", consideringVertex,
                    //    readingActive[consideringVertex], writingActive[consideringVertex], isDiscovered[consideringVertex]);
                }
                DeviceFunction.SyncThreads();

                while (readingAnythingAdded[0] && !shouldStop[0])
                {
                    if (threadIdx.x == 0)
                    {
                        readingAnythingAdded[0] = false;
                        writingAnythingAdded[0] = false;
                        //Console.WriteLine("distance {0}", nextDistance);
                    }

                    int myPart           = (power + blockDim.x - 1) / blockDim.x;
                    int beginningPointer = threadIdx.x * myPart;
                    int endingPointer    = beginningPointer + myPart;
                    if (power < endingPointer)
                    {
                        endingPointer = power;
                    }
                    if (minRead[threadIdx.x] > beginningPointer)
                    {
                        beginningPointer = minRead[threadIdx.x];
                    }
                    //if (maxRead[threadIdx.x] < endingPointer)
                    //    endingPointer = maxRead[threadIdx.x];
                    minRead[threadIdx.x]  = int.MaxValue;
                    maxRead[threadIdx.x]  = 0;
                    minWrite[threadIdx.x] = int.MaxValue;
                    maxWrite[threadIdx.x] = 0;
                    DeviceFunction.SyncThreads();
                    //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac);
                    for (int consideringVertex = beginningPointer; consideringVertex < endingPointer; ++consideringVertex)
                    {
                        //Console.WriteLine("writeIsActive[{0}]=={1}", consideringVertex, writingActive[consideringVertex]);
                        if (!readingActive[consideringVertex])
                        {
                            //Console.WriteLine("Skipping {0}", consideringVertex);
                            continue;
                        }
                        else
                        {
                            //Console.WriteLine("Considering {0} dist {1}", consideringVertex, nextDistance);
                            readingActive[consideringVertex] = false;
                        }

                        vertexAfterTransitionA = vertexAfterTransitionB = 0;
                        for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                        {
                            if (0 != (mask & consideringVertex))
                            {
                                vertexAfterTransitionA |= gpuA[i];
                                vertexAfterTransitionB |= gpuB[i];
                            }
                        }

                        if (!isDiscovered[vertexAfterTransitionA])
                        {
                            isDiscovered[vertexAfterTransitionA] = true;
                            //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionA, consideringVertex);
                            DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionA / threadWork), vertexAfterTransitionA % threadWork);
                            DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionA / threadWork), 1 + (vertexAfterTransitionA % threadWork));

                            if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                            {
                                shortestSynchronizingWordLength[ac] = nextDistance;
                                isSynchronizing[ac] = true;
                                shouldStop[0]       = true;
                                break;
                            }

                            writingActive[vertexAfterTransitionA] = true;
                            writingAnythingAdded[0] = true;
                        }

                        if (!isDiscovered[vertexAfterTransitionB])
                        {
                            isDiscovered[vertexAfterTransitionB] = true;
                            //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionB, consideringVertex);
                            DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionB / threadWork), vertexAfterTransitionB % threadWork);
                            DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionB / threadWork), 1 + (vertexAfterTransitionB % threadWork));

                            if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                            {
                                shortestSynchronizingWordLength[ac] = nextDistance;
                                isSynchronizing[ac] = true;
                                shouldStop[0]       = true;
                                break;
                            }

                            writingActive[vertexAfterTransitionB] = true;
                            writingAnythingAdded[0] = true;
                        }
                    }
                    ++nextDistance;
                    readingActive        = nextDistance % 2 == 0 ? isActiveEven : isActiveOdd;
                    writingActive        = nextDistance % 2 != 0 ? isActiveEven : isActiveOdd;
                    readingAnythingAdded = nextDistance % 2 == 0 ? addedAnythingEven : addedAnythingOdd;
                    writingAnythingAdded = nextDistance % 2 != 0 ? addedAnythingEven : addedAnythingOdd;
                    minRead  = nextDistance % 2 == 0 ? minEven : minOdd;
                    minWrite = nextDistance % 2 != 0 ? minEven : minOdd;
                    maxRead  = nextDistance % 2 == 0 ? maxEven : maxOdd;
                    maxWrite = nextDistance % 2 != 0 ? maxEven : maxOdd;
                    DeviceFunction.SyncThreads();
                }
            }
        }