public static void Kernel( int[] precomputedStateTransitioningMatrixA, int[] precomputedStateTransitioningMatrixB, bool[] isSynchronizing, int[] shortestSynchronizingWordLength) { var n = problemSize.Value; var arrayCount = precomputedStateTransitioningMatrixA.Length / n; var power = 1 << n; const int bitSize = 6; int twoToBitsize = (1 << bitSize) - 1; var wordCount = (8 * sizeof(int) / bitSize); #region Pointer setup var byteOffset = 0; var queueEvenCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += sizeof(int); var readingQueueIndex = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += sizeof(int); var queueOddCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += sizeof(int); var gpuAB = DeviceFunction.AddressOfArray(__shared__.ExternArray <uint>()) .Ptr(byteOffset / sizeof(uint)) .Volatile(); byteOffset += (n + 1) * sizeof(uint); // must be the last among ints var isDiscoveredPtr = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); var complexOffset = (power * sizeof(int) + wordCount - 1) / wordCount; byteOffset += complexOffset + (((complexOffset % sizeof(int)) & 1) == 1 ? 1 : 0); var queueEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += (power / 2 + 1) * sizeof(ushort); var queueOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += (power / 2 + 1) * sizeof(ushort); var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); #endregion ushort nextDistance; ushort vertexAfterTransitionA, vertexAfterTransitionB; uint vertexAfterTransition; var acPart = (arrayCount + gridDim.x - 1) / gridDim.x; var acBegin = blockIdx.x * acPart; var acEnd = acBegin + acPart; if (arrayCount < acEnd) { acEnd = arrayCount; } var index = acBegin * n; var maskN = (1 << n) - 1; for (int ac = acBegin; ac < acEnd; ac++, index += n) { // cleanup for (int consideringVertex = threadIdx.x, endingVertex = (power - 1) / wordCount; consideringVertex < endingVertex; consideringVertex += blockDim.x) { isDiscoveredPtr[consideringVertex] = 0; } if (threadIdx.x < n) { gpuAB[threadIdx.x + 1] = (uint)( (1 << (n + precomputedStateTransitioningMatrixA[index + threadIdx.x])) | (1 << precomputedStateTransitioningMatrixB[index + threadIdx.x]) ); } else if (threadIdx.x == n) { gpuAB[0] = 0; readingQueueIndex[0] = 0; shouldStop[0] = false; queueEvenCount[0] = 0; queueOddCount[0] = 1; queueOdd[0] = (ushort)(power - 1); // assuming n >= 2 isDiscoveredPtr[(power - 1) / wordCount] = 1 << (((power - 1) % wordCount) * bitSize); } var readingQueue = queueOdd; var writingQueue = queueEven; var readingQueueCount = queueOddCount; var writingQueueIndex = queueEvenCount; nextDistance = 1; int readingQueueCountCached = 1; DeviceFunction.SyncThreads(); while (readingQueueCountCached > 0 && !shouldStop[0]) { //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac); while (readingQueueIndex[0] < readingQueueCountCached) { var iter = DeviceFunction.AtomicAdd(readingQueueIndex, 1); if (iter >= readingQueueCountCached) { break; } int consideringVertex = readingQueue[iter]; vertexAfterTransition = 0; for (int i = 1; i <= n; i++, consideringVertex >>= 1) { vertexAfterTransition |= gpuAB[i * (1 & consideringVertex)]; } vertexAfterTransitionA = (ushort)(vertexAfterTransition >> n); vertexAfterTransitionB = (ushort)(vertexAfterTransition & maskN); var isDiscoveredOffset = (vertexAfterTransitionA % wordCount) * bitSize; if (0 == (isDiscoveredPtr[vertexAfterTransitionA / wordCount] & (twoToBitsize << isDiscoveredOffset))) { // should have used AtomicOr (which is not available in AleaGPU - very unfortunate) var beforeAdded = DeviceFunction.AtomicAdd( isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount), 1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset); if (0 == beforeAdded) { if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)] = (ushort)vertexAfterTransitionA; } else { DeviceFunction.AtomicSub( isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount), 1 << isDiscoveredOffset); } } isDiscoveredOffset = (vertexAfterTransitionB % wordCount) * bitSize; if (0 == (isDiscoveredPtr[vertexAfterTransitionB / wordCount] & (twoToBitsize << isDiscoveredOffset))) { var beforeAdded = DeviceFunction.AtomicAdd( isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount), 1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset); if (0 == beforeAdded) { if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)] = (ushort)vertexAfterTransitionB; } else { DeviceFunction.AtomicSub( isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount), 1 << isDiscoveredOffset); } } } DeviceFunction.SyncThreads(); ++nextDistance; readingQueue = nextDistance % 2 == 0 ? queueEven : queueOdd; writingQueue = nextDistance % 2 != 0 ? queueEven : queueOdd; readingQueueCount = nextDistance % 2 == 0 ? queueEvenCount : queueOddCount; writingQueueIndex = nextDistance % 2 != 0 ? queueEvenCount : queueOddCount; readingQueueCountCached = nextDistance % 2 == 0 ? queueEvenCount[0] : queueOddCount[0]; if (threadIdx.x == 0) { writingQueueIndex[0] = 0; readingQueueIndex[0] = 0; } DeviceFunction.SyncThreads(); } } }