public static void Kernel( int[] precomputedStateTransitioningMatrixA, int[] precomputedStateTransitioningMatrixB, bool[] isSynchronizing, int[] shortestSynchronizingWordLength) { var n = problemSize.Value; var arrayCount = precomputedStateTransitioningMatrixA.Length / n; var power = 1 << n; #region Pointer setup var byteOffset = 0; var minEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var minOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var maxEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var maxOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += n * sizeof(ushort); var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += n * sizeof(ushort); var isActiveEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)); byteOffset += power * sizeof(bool); var isActiveOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)); byteOffset += power * sizeof(bool); var isDiscovered = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)); byteOffset += power * sizeof(bool); var addedAnythingOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); var addedAnythingEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); #endregion ushort nextDistance; int vertexAfterTransitionA, vertexAfterTransitionB, index; var acPart = (arrayCount + gridDim.x - 1) / gridDim.x; var acBegin = blockIdx.x * acPart; var acEnd = acBegin + acPart; if (arrayCount < acEnd) { acEnd = arrayCount; } index = acBegin * n; var threadWork = (power + blockDim.x - 1) / blockDim.x; DeviceFunction.SyncThreads(); for (int ac = acBegin; ac < acEnd; ac++, index += n) { //Console.WriteLine("Begin {0}", threadIdx.x); // cleanup var readingActive = isActiveOdd; var writingActive = isActiveEven; var readingAnythingAdded = addedAnythingOdd; var writingAnythingAdded = addedAnythingEven; var minRead = minOdd; var minWrite = minEven; var maxRead = maxOdd; var maxWrite = maxEven; if (threadIdx.x == DeviceFunction.WarpSize || (threadIdx.x == 0 && blockDim.x <= DeviceFunction.WarpSize)) { for (int i = 0; i < n; i++) { gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]); gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]); } } minRead[threadIdx.x] = int.MaxValue; maxRead[threadIdx.x] = 0; minWrite[threadIdx.x] = int.MaxValue; maxWrite[threadIdx.x] = 0; if (threadIdx.x == 0) { shouldStop[0] = false; readingAnythingAdded[0] = true; } else if (threadIdx.x == blockDim.x - 1) { minRead[blockDim.x - 1] = (power - 1) % threadWork; maxRead[blockDim.x - 1] = power % threadWork; } nextDistance = 1; for (int consideringVertex = threadIdx.x, endingVertex = power, powerm1 = power - 1; consideringVertex < endingVertex; consideringVertex += blockDim.x) { isActiveEven[consideringVertex] = false; isActiveOdd[consideringVertex] = consideringVertex == powerm1; isDiscovered[consideringVertex] = consideringVertex == powerm1; //Console.WriteLine("cleaning {0}, {1} {2} {3}", consideringVertex, // readingActive[consideringVertex], writingActive[consideringVertex], isDiscovered[consideringVertex]); } DeviceFunction.SyncThreads(); while (readingAnythingAdded[0] && !shouldStop[0]) { if (threadIdx.x == 0) { readingAnythingAdded[0] = false; writingAnythingAdded[0] = false; //Console.WriteLine("distance {0}", nextDistance); } int myPart = (power + blockDim.x - 1) / blockDim.x; int beginningPointer = threadIdx.x * myPart; int endingPointer = beginningPointer + myPart; if (power < endingPointer) { endingPointer = power; } if (minRead[threadIdx.x] > beginningPointer) { beginningPointer = minRead[threadIdx.x]; } //if (maxRead[threadIdx.x] < endingPointer) // endingPointer = maxRead[threadIdx.x]; minRead[threadIdx.x] = int.MaxValue; maxRead[threadIdx.x] = 0; minWrite[threadIdx.x] = int.MaxValue; maxWrite[threadIdx.x] = 0; DeviceFunction.SyncThreads(); //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac); for (int consideringVertex = beginningPointer; consideringVertex < endingPointer; ++consideringVertex) { //Console.WriteLine("writeIsActive[{0}]=={1}", consideringVertex, writingActive[consideringVertex]); if (!readingActive[consideringVertex]) { //Console.WriteLine("Skipping {0}", consideringVertex); continue; } else { //Console.WriteLine("Considering {0} dist {1}", consideringVertex, nextDistance); readingActive[consideringVertex] = false; } vertexAfterTransitionA = vertexAfterTransitionB = 0; for (int i = 0, mask = 1; i < n; i++, mask <<= 1) { if (0 != (mask & consideringVertex)) { vertexAfterTransitionA |= gpuA[i]; vertexAfterTransitionB |= gpuB[i]; } } if (!isDiscovered[vertexAfterTransitionA]) { isDiscovered[vertexAfterTransitionA] = true; //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionA, consideringVertex); DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionA / threadWork), vertexAfterTransitionA % threadWork); DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionA / threadWork), 1 + (vertexAfterTransitionA % threadWork)); if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingActive[vertexAfterTransitionA] = true; writingAnythingAdded[0] = true; } if (!isDiscovered[vertexAfterTransitionB]) { isDiscovered[vertexAfterTransitionB] = true; //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionB, consideringVertex); DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionB / threadWork), vertexAfterTransitionB % threadWork); DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionB / threadWork), 1 + (vertexAfterTransitionB % threadWork)); if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingActive[vertexAfterTransitionB] = true; writingAnythingAdded[0] = true; } } ++nextDistance; readingActive = nextDistance % 2 == 0 ? isActiveEven : isActiveOdd; writingActive = nextDistance % 2 != 0 ? isActiveEven : isActiveOdd; readingAnythingAdded = nextDistance % 2 == 0 ? addedAnythingEven : addedAnythingOdd; writingAnythingAdded = nextDistance % 2 != 0 ? addedAnythingEven : addedAnythingOdd; minRead = nextDistance % 2 == 0 ? minEven : minOdd; minWrite = nextDistance % 2 != 0 ? minEven : minOdd; maxRead = nextDistance % 2 == 0 ? maxEven : maxOdd; maxWrite = nextDistance % 2 != 0 ? maxEven : maxOdd; DeviceFunction.SyncThreads(); } } }