public static void Kernel(
            int[] precomputedStateTransitioningMatrixA,
            int[] precomputedStateTransitioningMatrixB,
            bool[] isSynchronizing,
            int[] shortestSynchronizingWordLength)
        {
            var       n            = problemSize.Value;
            var       arrayCount   = precomputedStateTransitioningMatrixA.Length / n;
            var       power        = 1 << n;
            const int bitSize      = 6;
            int       twoToBitsize = (1 << bitSize) - 1;
            var       wordCount    = (8 * sizeof(int) / bitSize);

            #region Pointer setup
            var byteOffset = 0;

            var queueEvenCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                 .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);

            var readingQueueIndex = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                    .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);

            var queueOddCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                .Ptr(byteOffset / sizeof(int));
            byteOffset += sizeof(int);


            var gpuAB = DeviceFunction.AddressOfArray(__shared__.ExternArray <uint>())
                        .Ptr(byteOffset / sizeof(uint))
                        .Volatile();
            byteOffset += (n + 1) * sizeof(uint);

            // must be the last among ints
            var isDiscoveredPtr = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
                                  .Ptr(byteOffset / sizeof(int));
            var complexOffset = (power * sizeof(int) + wordCount - 1) / wordCount;
            byteOffset += complexOffset + (((complexOffset % sizeof(int)) & 1) == 1 ? 1 : 0);

            var queueEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                            .Ptr(byteOffset / sizeof(ushort))
                            .Volatile();
            byteOffset += (power / 2 + 1) * sizeof(ushort);

            var queueOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
                           .Ptr(byteOffset / sizeof(ushort))
                           .Volatile();
            byteOffset += (power / 2 + 1) * sizeof(ushort);

            var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
                             .Ptr(byteOffset / sizeof(bool))
                             .Volatile();
            byteOffset += sizeof(bool);
            #endregion

            ushort nextDistance;
            ushort vertexAfterTransitionA,
                   vertexAfterTransitionB;
            uint vertexAfterTransition;
            var  acPart  = (arrayCount + gridDim.x - 1) / gridDim.x;
            var  acBegin = blockIdx.x * acPart;
            var  acEnd   = acBegin + acPart;
            if (arrayCount < acEnd)
            {
                acEnd = arrayCount;
            }
            var index = acBegin * n;
            var maskN = (1 << n) - 1;

            for (int ac = acBegin; ac < acEnd; ac++, index += n)
            {
                // cleanup
                for (int consideringVertex = threadIdx.x, endingVertex = (power - 1) / wordCount;
                     consideringVertex < endingVertex;
                     consideringVertex += blockDim.x)
                {
                    isDiscoveredPtr[consideringVertex] = 0;
                }

                if (threadIdx.x < n)
                {
                    gpuAB[threadIdx.x + 1] = (uint)(
                        (1 << (n + precomputedStateTransitioningMatrixA[index + threadIdx.x]))
                        | (1 << precomputedStateTransitioningMatrixB[index + threadIdx.x])
                        );
                }
                else if (threadIdx.x == n)
                {
                    gpuAB[0]             = 0;
                    readingQueueIndex[0] = 0;
                    shouldStop[0]        = false;
                    queueEvenCount[0]    = 0;
                    queueOddCount[0]     = 1;
                    queueOdd[0]          = (ushort)(power - 1);
                    // assuming n >= 2
                    isDiscoveredPtr[(power - 1) / wordCount] = 1 << (((power - 1) % wordCount) * bitSize);
                }

                var readingQueue      = queueOdd;
                var writingQueue      = queueEven;
                var readingQueueCount = queueOddCount;
                var writingQueueIndex = queueEvenCount;

                nextDistance = 1;
                int readingQueueCountCached = 1;
                DeviceFunction.SyncThreads();

                while (readingQueueCountCached > 0 && !shouldStop[0])
                {
                    //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac);
                    while (readingQueueIndex[0] < readingQueueCountCached)
                    {
                        var iter = DeviceFunction.AtomicAdd(readingQueueIndex, 1);
                        if (iter >= readingQueueCountCached)
                        {
                            break;
                        }

                        int consideringVertex = readingQueue[iter];

                        vertexAfterTransition = 0;
                        for (int i = 1; i <= n; i++, consideringVertex >>= 1)
                        {
                            vertexAfterTransition |= gpuAB[i * (1 & consideringVertex)];
                        }

                        vertexAfterTransitionA = (ushort)(vertexAfterTransition >> n);
                        vertexAfterTransitionB = (ushort)(vertexAfterTransition & maskN);

                        var isDiscoveredOffset = (vertexAfterTransitionA % wordCount) * bitSize;

                        if (0 == (isDiscoveredPtr[vertexAfterTransitionA / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                        {
                            // should have used AtomicOr (which is not available in AleaGPU - very unfortunate)
                            var beforeAdded = DeviceFunction.AtomicAdd(
                                isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                                1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);

                            if (0 == beforeAdded)
                            {
                                if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                                {
                                    shortestSynchronizingWordLength[ac] = nextDistance;
                                    isSynchronizing[ac] = true;
                                    shouldStop[0]       = true;
                                    break;
                                }

                                writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)]
                                    = (ushort)vertexAfterTransitionA;
                            }
                            else
                            {
                                DeviceFunction.AtomicSub(
                                    isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                                    1 << isDiscoveredOffset);
                            }
                        }

                        isDiscoveredOffset = (vertexAfterTransitionB % wordCount) * bitSize;
                        if (0 == (isDiscoveredPtr[vertexAfterTransitionB / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                        {
                            var beforeAdded = DeviceFunction.AtomicAdd(
                                isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                                1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);

                            if (0 == beforeAdded)
                            {
                                if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                                {
                                    shortestSynchronizingWordLength[ac] = nextDistance;
                                    isSynchronizing[ac] = true;
                                    shouldStop[0]       = true;
                                    break;
                                }

                                writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)]
                                    = (ushort)vertexAfterTransitionB;
                            }
                            else
                            {
                                DeviceFunction.AtomicSub(
                                    isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                                    1 << isDiscoveredOffset);
                            }
                        }
                    }
                    DeviceFunction.SyncThreads();
                    ++nextDistance;
                    readingQueue            = nextDistance % 2 == 0 ? queueEven : queueOdd;
                    writingQueue            = nextDistance % 2 != 0 ? queueEven : queueOdd;
                    readingQueueCount       = nextDistance % 2 == 0 ? queueEvenCount : queueOddCount;
                    writingQueueIndex       = nextDistance % 2 != 0 ? queueEvenCount : queueOddCount;
                    readingQueueCountCached = nextDistance % 2 == 0 ? queueEvenCount[0] : queueOddCount[0];
                    if (threadIdx.x == 0)
                    {
                        writingQueueIndex[0] = 0;
                        readingQueueIndex[0] = 0;
                    }
                    DeviceFunction.SyncThreads();
                }
            }
        }