Exemplo n.º 1
0
        private static void AleaKernelFloat2(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            int c,
            int n,
            int pitch)
        {
            // Same as KernelSharedMemory, but one thread does two element in one by using float2 reads.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            var line = threadIdx.x / (blockDim.x / 2);
            var tid  = threadIdx.x % (blockDim.x / 2);

            if (bJ + tid * 2 < n)
            {
                var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();

                for (int i = line; i < blockDim.x && bI + i < n; i += 2)
                {
                    Real dist0 = 0;
                    Real dist1 = 0;

                    for (int k = 0; k != c; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];
                        var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                        var diff   = new Real2(coord1 - coord2.x, coord1 - coord2.y);

                        dist0 += diff.x * diff.x;
                        dist1 += diff.y * diff.y;
                    }

                    mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 0)] = dist0;
                    mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 1)] = dist1;
                }
            }
        }
Exemplo n.º 2
0
        private static void AleaKernelConstants(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            Constant <int> c,
            int n,
            int pitch)
        {
            // Same as CudaKernelOptimised2, but the number of coordinates is given as a meta-constant.
            // Also, we write the results as float2.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c.Value * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c.Value; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            var line = threadIdx.x / (blockDim.x / 2);
            var tid  = threadIdx.x % (blockDim.x / 2);

            if (bJ + tid * 2 < n)
            {
                var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();

                for (int i = line; i < blockDim.x && bI + i < n; i += 2)
                {
                    var dist = default(Real2);

                    for (int k = 0; k != c.Value; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];
                        var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                        var diff   = new Real2(coord1 - coord2.x, coord1 - coord2.y);

                        dist.x += diff.x * diff.x;
                        dist.y += diff.y * diff.y;
                    }

                    var dst = mSquaredDistances.Ptr((bI + i) * pitch + bJ).Reinterpret <Real2>();
                    dst[tid] = dist;
                }
            }
        }
Exemplo n.º 3
0
        private static void AleaKernelSharedMemory(
            deviceptr <Real> mSquaredDistances,
            deviceptr <Real> mCoordinates,
            int c,
            int n,
            int pitch)
        {
            // We've got shared memory of two vector of K dimensions for B points:
            //
            //      var coordI = __shared__ new Real[k*blockDim.x];
            //      var coordJ = __shared__ new Real[k*blockDim.x];
            //
            // We fill in these two vectors with the coordinates of the I points and J points.
            // Afterwards, the current block will compute the euclidean distances between all
            // the I points and J points, producing a square matrix [B, B].
            //
            // This optimisation means that when producing the square matrix, the I and J points
            // coordinates are only read once.
            //
            // This optimisation works well if K is small enough. Otherwise the shared memory is
            // too small and not enough blocks get schedule per SM.

            var shared       = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
            var coordinatesI = shared.Ptr(0);
            var coordinatesJ = shared.Ptr(c * blockDim.x);

            var bI = blockIdx.y * blockDim.x;
            var bJ = blockIdx.x * blockDim.x;

            for (int k = 0; k != c; ++k)
            {
                if (bI + threadIdx.x < n)
                {
                    coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
                }

                if (bJ + threadIdx.x < n)
                {
                    coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
                }
            }

            DeviceFunction.SyncThreads();

            if (bJ + threadIdx.x < n)
            {
                for (int i = 0; i < blockDim.x && bI + i < n; ++i)
                {
                    Real dist = 0;

                    for (int k = 0; k != c; ++k)
                    {
                        var coord1 = coordinatesI[k * blockDim.x + i];           //mCoordinates[k * x + i];
                        var coord2 = coordinatesJ[k * blockDim.x + threadIdx.x]; //mCoordinates[k * x + j];
                        var diff   = coord1 - coord2;

                        dist += diff * diff;
                    }

                    mSquaredDistances[(bI + i) * pitch + (bJ + threadIdx.x)] = dist;
                }
            }
        }
        public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] statusOfSynchronization)
        {
            // the status might be YES, NO and DUNNO (aleaGPU enum???)
            // TODO: change this Kernel and computation!
            var n          = problemSize.Value;
            var arrayCount = precomputedStateTransitioningMatrixA.Length / n;
            var power      = 1 << n;

            #region Pointer setup
            var byteOffset = 0;

            var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);

            var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);
            #endregion

            var acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var acBegin = blockIdx.x * acPart;
            var acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            var index = acBegin * n;
            for (int ac = acBegin; ac < acEnd; ac++, index += n)
            {
                DeviceFunction.SyncThreads();
                if (threadIdx.x == 0)
                {
                    for (int i = 0; i < n; i++)
                    {
                        gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]);
                        gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]);
                    }
                }
                var pathMask = threadIdx.x;
                int vertexAfterTransition;
                var consideringVertex = power - 1;
                DeviceFunction.SyncThreads();
                for (int iter = 0; iter < 9; iter++, pathMask >>= 1)
                {
                    vertexAfterTransition = 0;
                    if ((pathMask & 1) == 0)
                    {
                        for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                        {
                            if (0 != (mask & consideringVertex))
                            {
                                vertexAfterTransition |= gpuA[i];
                            }
                        }
                    }
                    else
                    {
                        for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                        {
                            if (0 != (mask & consideringVertex))
                            {
                                vertexAfterTransition |= gpuB[i];
                            }
                        }
                    }
                    consideringVertex = vertexAfterTransition;
                }
                var singleVertex = DeviceFunction.Any(0 == (consideringVertex & (consideringVertex - 1)));
                if (singleVertex && threadIdx.x % DeviceFunction.WarpSize == 0)
                {
                    statusOfSynchronization[ac] = true;
                }
            }
        }
        public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] isSynchronizing,
            int[] shortestSynchronizingWordLength)
        {
            var       n            = problemSize.Value;
            var       arrayCount   = precomputedStateTransitioningMatrixA.Length / n;
            var       power        = 1 << n;
            const int bitSize      = 6;
            int       twoToBitsize = (1 << bitSize) - 1;
            var       wordCount    = (8 * sizeof(int) / bitSize);

            #region Pointer setup
            var byteOffset = 0;

            var queueEvenCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                 .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);

            var readingQueueIndex = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                    .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);

            var queueOddCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);


            var gpuAB = DeviceFunction.AddressOfArray(__shared__.ExternArray <uint>())
                        .Ptr(byteOffset / sizeof(uint))
                        .Volatile();
            byteOffset += (n + 1) * sizeof(uint);

            // must be the last among ints
            var isDiscoveredPtr = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                  .Ptr(byteOffset / sizeof(int));
            var complexOffset = (power * sizeof(int) + wordCount - 1) / wordCount;
            byteOffset += complexOffset + (((complexOffset % sizeof(int)) & 1) == 1 ? 1 : 0);

            var queueEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                            .Ptr(byteOffset / sizeof(ushort))
                            .Volatile();
            byteOffset += (power / 2 + 1) * sizeof(ushort);

            var queueOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                           .Ptr(byteOffset / sizeof(ushort))
                           .Volatile();
            byteOffset += (power / 2 + 1) * sizeof(ushort);

            var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                             .Ptr(byteOffset / sizeof(bool))
                             .Volatile();
            byteOffset += sizeof(bool);
            #endregion

            ushort nextDistance;
            ushort vertexAfterTransitionA,
                   vertexAfterTransitionB;
            uint vertexAfterTransition;
            var  acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var  acBegin = blockIdx.x * acPart;
            var  acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            var index = acBegin * n;
            var maskN = (1 << n) - 1;

            for (int ac = acBegin; ac < acEnd; ac++, index += n)
            {
                // cleanup
                for (int consideringVertex = threadIdx.x, endingVertex = (power - 1) / wordCount;
                     consideringVertex < endingVertex;
                     consideringVertex += blockDim.x)
                {
                    isDiscoveredPtr[consideringVertex] = 0;
                }

                if (threadIdx.x < n)
                {
                    gpuAB[threadIdx.x + 1] = (uint)(
                        (1 << (n + precomputedStateTransitioningMatrixA[index + threadIdx.x]))
                        | (1 << precomputedStateTransitioningMatrixB[index + threadIdx.x])
                        );
                }
                else if (threadIdx.x == n)
                {
                    gpuAB[0]             = 0;
                    readingQueueIndex[0] = 0;
                    shouldStop[0]        = false;
                    queueEvenCount[0]    = 0;
                    queueOddCount[0]     = 1;
                    queueOdd[0]          = (ushort)(power - 1);
                    // assuming n >= 2
                    isDiscoveredPtr[(power - 1) / wordCount] = 1 << (((power - 1) % wordCount) * bitSize);
                }

                var readingQueue      = queueOdd;
                var writingQueue      = queueEven;
                var readingQueueCount = queueOddCount;
                var writingQueueIndex = queueEvenCount;

                nextDistance = 1;
                int readingQueueCountCached = 1;
                DeviceFunction.SyncThreads();

                while (readingQueueCountCached > 0 && !shouldStop[0])
                {
                    //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac);
                    while (readingQueueIndex[0] < readingQueueCountCached)
                    {
                        var iter = DeviceFunction.AtomicAdd(readingQueueIndex, 1);
                        if (iter >= readingQueueCountCached)
                        {
                            break;
                        }

                        int consideringVertex = readingQueue[iter];

                        vertexAfterTransition = 0;
                        for (int i = 1; i <= n; i++, consideringVertex >>= 1)
                        {
                            vertexAfterTransition |= gpuAB[i * (1 & consideringVertex)];
                        }

                        vertexAfterTransitionA = (ushort)(vertexAfterTransition >> n);
                        vertexAfterTransitionB = (ushort)(vertexAfterTransition & maskN);

                        var isDiscoveredOffset = (vertexAfterTransitionA % wordCount) * bitSize;

                        if (0 == (isDiscoveredPtr[vertexAfterTransitionA / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                        {
                            // should have used AtomicOr (which is not available in AleaGPU - very unfortunate)
                            var beforeAdded = DeviceFunction.AtomicAdd(
                                isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                                1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);

                            if (0 == beforeAdded)
                            {
                                if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                                {
                                    shortestSynchronizingWordLength[ac] = nextDistance;
                                    isSynchronizing[ac] = true;
                                    shouldStop[0]       = true;
                                    break;
                                }

                                writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)]
                                    = (ushort)vertexAfterTransitionA;
                            }
                            else
                            {
                                DeviceFunction.AtomicSub(
                                    isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                                    1 << isDiscoveredOffset);
                            }
                        }

                        isDiscoveredOffset = (vertexAfterTransitionB % wordCount) * bitSize;
                        if (0 == (isDiscoveredPtr[vertexAfterTransitionB / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                        {
                            var beforeAdded = DeviceFunction.AtomicAdd(
                                isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                                1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);

                            if (0 == beforeAdded)
                            {
                                if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                                {
                                    shortestSynchronizingWordLength[ac] = nextDistance;
                                    isSynchronizing[ac] = true;
                                    shouldStop[0]       = true;
                                    break;
                                }

                                writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)]
                                    = (ushort)vertexAfterTransitionB;
                            }
                            else
                            {
                                DeviceFunction.AtomicSub(
                                    isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                                    1 << isDiscoveredOffset);
                            }
                        }
                    }
                    DeviceFunction.SyncThreads();
                    ++nextDistance;
                    readingQueue            = nextDistance % 2 == 0 ? queueEven : queueOdd;
                    writingQueue            = nextDistance % 2 != 0 ? queueEven : queueOdd;
                    readingQueueCount       = nextDistance % 2 == 0 ? queueEvenCount : queueOddCount;
                    writingQueueIndex       = nextDistance % 2 != 0 ? queueEvenCount : queueOddCount;
                    readingQueueCountCached = nextDistance % 2 == 0 ? queueEvenCount[0] : queueOddCount[0];
                    if (threadIdx.x == 0)
                    {
                        writingQueueIndex[0] = 0;
                        readingQueueIndex[0] = 0;
                    }
                    DeviceFunction.SyncThreads();
                }
            }
        }
        public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] isSynchronizing,
            int[] shortestSynchronizingWordLength)
        {
            if (threadIdx.x % DeviceFunction.WarpSize != 0)
            {
                return;
            }

            var n               = problemSize.Value;
            var arrayCount      = precomputedStateTransitioningMatrixA.Length / n;
            var powerSetCount   = 1 << n;
            var queueUpperBound = powerSetCount / 2 + 1;
            var initialVertex   = (ushort)(powerSetCount - 1);
            var warpCount       = blockDim.x / DeviceFunction.WarpSize;

            var threadOffset = threadIdx.x / DeviceFunction.WarpSize;

            #region Pointer setup
            var byteOffset = 0;

            var queueItself = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                              .Ptr(byteOffset / sizeof(ushort))
                              .Ptr(threadOffset * queueUpperBound);
            byteOffset += sizeof(ushort) * queueUpperBound * warpCount;

            var gpuAs = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                        .Ptr(byteOffset / sizeof(ushort))
                        .Ptr(threadOffset * n)
                        .Volatile();
            byteOffset += sizeof(ushort) * n * warpCount;

            var gpuBs = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                        .Ptr(byteOffset / sizeof(ushort))
                        .Ptr(threadOffset * n)
                        .Volatile();
            byteOffset += sizeof(ushort) * n * warpCount;

            var isDiscovered = DeviceFunction.AddressOfArray(__shared__.ExternArray <byte>())
                               .Ptr(byteOffset / sizeof(byte))
                               .Ptr(threadOffset * powerSetCount);
            byteOffset += sizeof(byte) * powerSetCount * warpCount;
            #endregion
            int index;
            var acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var acBegin = blockIdx.x * acPart;
            var acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            index = acBegin * n;
            int  queueReadingIndexModQueueSize = 0, queueWritingIndexModQueueSize = 0;
            int  queueReadingIndexTotals = 0, queueWritingIndexTotals = 0;
            byte localCyclicProblemId = 1;
            for (int ac = acBegin; ac < acEnd; ac++, index += n, localCyclicProblemId++)
            {
                queueReadingIndexTotals       = queueWritingIndexTotals = 0;
                queueReadingIndexModQueueSize = queueWritingIndexModQueueSize = 0;

                if (localCyclicProblemId == 0)
                {
                    for (int i = 0; i < powerSetCount; i++)
                    {
                        isDiscovered[i] = 0;
                    }

                    localCyclicProblemId = 1;
                }
                for (int i = 0; i < n; i++)
                {
                    gpuAs[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]);
                    gpuBs[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]);
                }

                // the queue is surely at least 2 vertices long...
                // no need for modulo operations
                queueItself[queueWritingIndexModQueueSize++] = initialVertex;

                var    discoveredSingleton = false;
                ushort consideringVertex;
                ushort vertexAfterTransitionA;
                ushort vertexAfterTransitionB;
                ushort firstSingletonDistance = 0;

                ushort currentNextDistance = 1;
                var    verticesUntilBump   = int.MaxValue;
                var    seekingFirstNext    = true;

                // there is something to read
                while (queueWritingIndexModQueueSize > queueReadingIndexModQueueSize || queueWritingIndexTotals > queueReadingIndexTotals)
                {
                    consideringVertex = queueItself[queueReadingIndexModQueueSize++];
                    if (queueReadingIndexModQueueSize == queueUpperBound)
                    {
                        queueReadingIndexTotals++;
                        queueReadingIndexModQueueSize = 0;
                    }

                    if (--verticesUntilBump == 0)
                    {
                        ++currentNextDistance;
                        seekingFirstNext = true;
                    }

                    vertexAfterTransitionA = vertexAfterTransitionB = 0;

                    for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                    {
                        if (0 != (mask & consideringVertex))
                        {
                            vertexAfterTransitionA |= gpuAs[i];
                            vertexAfterTransitionB |= gpuBs[i];
                        }
                    }

                    if (localCyclicProblemId != isDiscovered[vertexAfterTransitionA])
                    {
                        if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                        {
                            discoveredSingleton    = true;
                            firstSingletonDistance = currentNextDistance;
                            break;
                        }

                        isDiscovered[vertexAfterTransitionA]         = localCyclicProblemId;
                        queueItself[queueWritingIndexModQueueSize++] = vertexAfterTransitionA;
                        if (queueWritingIndexModQueueSize == queueUpperBound)
                        {
                            queueWritingIndexTotals++;
                            queueWritingIndexModQueueSize = 0;
                        }

                        if (seekingFirstNext)
                        {
                            seekingFirstNext  = false;
                            verticesUntilBump = (queueWritingIndexTotals - queueReadingIndexTotals) * queueUpperBound + (queueWritingIndexModQueueSize - queueReadingIndexModQueueSize);
                        }
                    }

                    if (localCyclicProblemId != isDiscovered[vertexAfterTransitionB])
                    {
                        if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                        {
                            discoveredSingleton    = true;
                            firstSingletonDistance = currentNextDistance;
                            break;
                        }

                        isDiscovered[vertexAfterTransitionB]         = localCyclicProblemId;
                        queueItself[queueWritingIndexModQueueSize++] = vertexAfterTransitionB;
                        if (queueWritingIndexModQueueSize == queueUpperBound)
                        {
                            queueWritingIndexTotals++;
                            queueWritingIndexModQueueSize = 0;
                        }

                        if (seekingFirstNext)
                        {
                            seekingFirstNext  = false;
                            verticesUntilBump = (queueWritingIndexTotals - queueReadingIndexTotals) * queueUpperBound + (queueWritingIndexModQueueSize - queueReadingIndexModQueueSize);
                        }
                    }
                }
                if (discoveredSingleton)
                {
                    isSynchronizing[ac] = true;
                    shortestSynchronizingWordLength[ac] = firstSingletonDistance;
                }
            }
        }
Exemplo n.º 7
0
        public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] isSynchronizing,
            int[] shortestSynchronizingWordLength)
        {
            var n          = problemSize.Value;
            var arrayCount = precomputedStateTransitioningMatrixA.Length / n;
            var power      = 1 << n;

            #region Pointer setup
            var byteOffset = 0;

            var minEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                          .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var minOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                         .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var maxEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                          .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var maxOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                         .Ptr(byteOffset / sizeof(int));
            byteOffset += blockDim.x * sizeof(int);

            var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);

            var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                       .Ptr(byteOffset / sizeof(ushort))
                       .Volatile();
            byteOffset += n * sizeof(ushort);

            var isActiveEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                               .Ptr(byteOffset / sizeof(bool));
            byteOffset += power * sizeof(bool);

            var isActiveOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                              .Ptr(byteOffset / sizeof(bool));
            byteOffset += power * sizeof(bool);

            var isDiscovered = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                               .Ptr(byteOffset / sizeof(bool));
            byteOffset += power * sizeof(bool);

            var addedAnythingOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                                   .Ptr(byteOffset / sizeof(bool))
                                   .Volatile();
            byteOffset += sizeof(bool);

            var addedAnythingEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                                    .Ptr(byteOffset / sizeof(bool))
                                    .Volatile();
            byteOffset += sizeof(bool);

            var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                             .Ptr(byteOffset / sizeof(bool))
                             .Volatile();
            byteOffset += sizeof(bool);
            #endregion


            ushort nextDistance;
            int    vertexAfterTransitionA,
                   vertexAfterTransitionB,
                   index;
            var acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var acBegin = blockIdx.x * acPart;
            var acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            index = acBegin * n;
            var threadWork = (power + blockDim.x - 1) / blockDim.x;
            DeviceFunction.SyncThreads();
            for (int ac = acBegin; ac < acEnd; ac++, index += n)
            {
                //Console.WriteLine("Begin {0}", threadIdx.x);
                // cleanup
                var readingActive        = isActiveOdd;
                var writingActive        = isActiveEven;
                var readingAnythingAdded = addedAnythingOdd;
                var writingAnythingAdded = addedAnythingEven;
                var minRead  = minOdd;
                var minWrite = minEven;
                var maxRead  = maxOdd;
                var maxWrite = maxEven;

                if (threadIdx.x == DeviceFunction.WarpSize || (threadIdx.x == 0 && blockDim.x <= DeviceFunction.WarpSize))
                {
                    for (int i = 0; i < n; i++)
                    {
                        gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]);
                        gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]);
                    }
                }

                minRead[threadIdx.x]  = int.MaxValue;
                maxRead[threadIdx.x]  = 0;
                minWrite[threadIdx.x] = int.MaxValue;
                maxWrite[threadIdx.x] = 0;
                if (threadIdx.x == 0)
                {
                    shouldStop[0]           = false;
                    readingAnythingAdded[0] = true;
                }
                else if (threadIdx.x == blockDim.x - 1)
                {
                    minRead[blockDim.x - 1] = (power - 1) % threadWork;
                    maxRead[blockDim.x - 1] = power % threadWork;
                }
                nextDistance = 1;
                for (int consideringVertex = threadIdx.x, endingVertex = power, powerm1 = power - 1;
                     consideringVertex < endingVertex;
                     consideringVertex += blockDim.x)
                {
                    isActiveEven[consideringVertex] = false;
                    isActiveOdd[consideringVertex]  = consideringVertex == powerm1;
                    isDiscovered[consideringVertex] = consideringVertex == powerm1;
                    //Console.WriteLine("cleaning {0}, {1} {2} {3}", consideringVertex,
                    //    readingActive[consideringVertex], writingActive[consideringVertex], isDiscovered[consideringVertex]);
                }
                DeviceFunction.SyncThreads();

                while (readingAnythingAdded[0] && !shouldStop[0])
                {
                    if (threadIdx.x == 0)
                    {
                        readingAnythingAdded[0] = false;
                        writingAnythingAdded[0] = false;
                        //Console.WriteLine("distance {0}", nextDistance);
                    }

                    int myPart           = (power + blockDim.x - 1) / blockDim.x;
                    int beginningPointer = threadIdx.x * myPart;
                    int endingPointer    = beginningPointer + myPart;
                    if (power < endingPointer)
                    {
                        endingPointer = power;
                    }
                    if (minRead[threadIdx.x] > beginningPointer)
                    {
                        beginningPointer = minRead[threadIdx.x];
                    }
                    //if (maxRead[threadIdx.x] < endingPointer)
                    //    endingPointer = maxRead[threadIdx.x];
                    minRead[threadIdx.x]  = int.MaxValue;
                    maxRead[threadIdx.x]  = 0;
                    minWrite[threadIdx.x] = int.MaxValue;
                    maxWrite[threadIdx.x] = 0;
                    DeviceFunction.SyncThreads();
                    //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac);
                    for (int consideringVertex = beginningPointer; consideringVertex < endingPointer; ++consideringVertex)
                    {
                        //Console.WriteLine("writeIsActive[{0}]=={1}", consideringVertex, writingActive[consideringVertex]);
                        if (!readingActive[consideringVertex])
                        {
                            //Console.WriteLine("Skipping {0}", consideringVertex);
                            continue;
                        }
                        else
                        {
                            //Console.WriteLine("Considering {0} dist {1}", consideringVertex, nextDistance);
                            readingActive[consideringVertex] = false;
                        }

                        vertexAfterTransitionA = vertexAfterTransitionB = 0;
                        for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                        {
                            if (0 != (mask & consideringVertex))
                            {
                                vertexAfterTransitionA |= gpuA[i];
                                vertexAfterTransitionB |= gpuB[i];
                            }
                        }

                        if (!isDiscovered[vertexAfterTransitionA])
                        {
                            isDiscovered[vertexAfterTransitionA] = true;
                            //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionA, consideringVertex);
                            DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionA / threadWork), vertexAfterTransitionA % threadWork);
                            DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionA / threadWork), 1 + (vertexAfterTransitionA % threadWork));

                            if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                            {
                                shortestSynchronizingWordLength[ac] = nextDistance;
                                isSynchronizing[ac] = true;
                                shouldStop[0]       = true;
                                break;
                            }

                            writingActive[vertexAfterTransitionA] = true;
                            writingAnythingAdded[0] = true;
                        }

                        if (!isDiscovered[vertexAfterTransitionB])
                        {
                            isDiscovered[vertexAfterTransitionB] = true;
                            //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionB, consideringVertex);
                            DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionB / threadWork), vertexAfterTransitionB % threadWork);
                            DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionB / threadWork), 1 + (vertexAfterTransitionB % threadWork));

                            if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                            {
                                shortestSynchronizingWordLength[ac] = nextDistance;
                                isSynchronizing[ac] = true;
                                shouldStop[0]       = true;
                                break;
                            }

                            writingActive[vertexAfterTransitionB] = true;
                            writingAnythingAdded[0] = true;
                        }
                    }
                    ++nextDistance;
                    readingActive        = nextDistance % 2 == 0 ? isActiveEven : isActiveOdd;
                    writingActive        = nextDistance % 2 != 0 ? isActiveEven : isActiveOdd;
                    readingAnythingAdded = nextDistance % 2 == 0 ? addedAnythingEven : addedAnythingOdd;
                    writingAnythingAdded = nextDistance % 2 != 0 ? addedAnythingEven : addedAnythingOdd;
                    minRead  = nextDistance % 2 == 0 ? minEven : minOdd;
                    minWrite = nextDistance % 2 != 0 ? minEven : minOdd;
                    maxRead  = nextDistance % 2 == 0 ? maxEven : maxOdd;
                    maxWrite = nextDistance % 2 != 0 ? maxEven : maxOdd;
                    DeviceFunction.SyncThreads();
                }
            }
        }