private static void AleaKernelFloat2( deviceptr <Real> mSquaredDistances, deviceptr <Real> mCoordinates, int c, int n, int pitch) { // Same as KernelSharedMemory, but one thread does two element in one by using float2 reads. var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>()); var coordinatesI = shared.Ptr(0); var coordinatesJ = shared.Ptr(c * blockDim.x); var bI = blockIdx.y * blockDim.x; var bJ = blockIdx.x * blockDim.x; for (int k = 0; k != c; ++k) { if (bI + threadIdx.x < n) { coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x]; } if (bJ + threadIdx.x < n) { coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x]; } } DeviceFunction.SyncThreads(); var line = threadIdx.x / (blockDim.x / 2); var tid = threadIdx.x % (blockDim.x / 2); if (bJ + tid * 2 < n) { var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>(); for (int i = line; i < blockDim.x && bI + i < n; i += 2) { Real dist0 = 0; Real dist1 = 0; for (int k = 0; k != c; ++k) { var coord1 = coordinatesI[k * blockDim.x + i]; var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid]; var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y); dist0 += diff.x * diff.x; dist1 += diff.y * diff.y; } mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 0)] = dist0; mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 1)] = dist1; } } }
private static void AleaKernelConstants( deviceptr <Real> mSquaredDistances, deviceptr <Real> mCoordinates, Constant <int> c, int n, int pitch) { // Same as CudaKernelOptimised2, but the number of coordinates is given as a meta-constant. // Also, we write the results as float2. var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>()); var coordinatesI = shared.Ptr(0); var coordinatesJ = shared.Ptr(c.Value * blockDim.x); var bI = blockIdx.y * blockDim.x; var bJ = blockIdx.x * blockDim.x; for (int k = 0; k != c.Value; ++k) { if (bI + threadIdx.x < n) { coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x]; } if (bJ + threadIdx.x < n) { coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x]; } } DeviceFunction.SyncThreads(); var line = threadIdx.x / (blockDim.x / 2); var tid = threadIdx.x % (blockDim.x / 2); if (bJ + tid * 2 < n) { var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>(); for (int i = line; i < blockDim.x && bI + i < n; i += 2) { var dist = default(Real2); for (int k = 0; k != c.Value; ++k) { var coord1 = coordinatesI[k * blockDim.x + i]; var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid]; var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y); dist.x += diff.x * diff.x; dist.y += diff.y * diff.y; } var dst = mSquaredDistances.Ptr((bI + i) * pitch + bJ).Reinterpret <Real2>(); dst[tid] = dist; } } }
private static void AleaKernelSharedMemory( deviceptr <Real> mSquaredDistances, deviceptr <Real> mCoordinates, int c, int n, int pitch) { // We've got shared memory of two vector of K dimensions for B points: // // var coordI = __shared__ new Real[k*blockDim.x]; // var coordJ = __shared__ new Real[k*blockDim.x]; // // We fill in these two vectors with the coordinates of the I points and J points. // Afterwards, the current block will compute the euclidean distances between all // the I points and J points, producing a square matrix [B, B]. // // This optimisation means that when producing the square matrix, the I and J points // coordinates are only read once. // // This optimisation works well if K is small enough. Otherwise the shared memory is // too small and not enough blocks get schedule per SM. var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>()); var coordinatesI = shared.Ptr(0); var coordinatesJ = shared.Ptr(c * blockDim.x); var bI = blockIdx.y * blockDim.x; var bJ = blockIdx.x * blockDim.x; for (int k = 0; k != c; ++k) { if (bI + threadIdx.x < n) { coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x]; } if (bJ + threadIdx.x < n) { coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x]; } } DeviceFunction.SyncThreads(); if (bJ + threadIdx.x < n) { for (int i = 0; i < blockDim.x && bI + i < n; ++i) { Real dist = 0; for (int k = 0; k != c; ++k) { var coord1 = coordinatesI[k * blockDim.x + i]; //mCoordinates[k * x + i]; var coord2 = coordinatesJ[k * blockDim.x + threadIdx.x]; //mCoordinates[k * x + j]; var diff = coord1 - coord2; dist += diff * diff; } mSquaredDistances[(bI + i) * pitch + (bJ + threadIdx.x)] = dist; } } }
public static void Kernel( int[] precomputedStateTransitioningMatrixA, int[] precomputedStateTransitioningMatrixB, bool[] statusOfSynchronization) { // the status might be YES, NO and DUNNO (aleaGPU enum???) // TODO: change this Kernel and computation! var n = problemSize.Value; var arrayCount = precomputedStateTransitioningMatrixA.Length / n; var power = 1 << n; #region Pointer setup var byteOffset = 0; var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += n * sizeof(ushort); var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += n * sizeof(ushort); #endregion var acPart = (arrayCount + gridDim.x - 1) / gridDim.x; var acBegin = blockIdx.x * acPart; var acEnd = acBegin + acPart; if (arrayCount < acEnd) { acEnd = arrayCount; } var index = acBegin * n; for (int ac = acBegin; ac < acEnd; ac++, index += n) { DeviceFunction.SyncThreads(); if (threadIdx.x == 0) { for (int i = 0; i < n; i++) { gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]); gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]); } } var pathMask = threadIdx.x; int vertexAfterTransition; var consideringVertex = power - 1; DeviceFunction.SyncThreads(); for (int iter = 0; iter < 9; iter++, pathMask >>= 1) { vertexAfterTransition = 0; if ((pathMask & 1) == 0) { for (int i = 0, mask = 1; i < n; i++, mask <<= 1) { if (0 != (mask & consideringVertex)) { vertexAfterTransition |= gpuA[i]; } } } else { for (int i = 0, mask = 1; i < n; i++, mask <<= 1) { if (0 != (mask & consideringVertex)) { vertexAfterTransition |= gpuB[i]; } } } consideringVertex = vertexAfterTransition; } var singleVertex = DeviceFunction.Any(0 == (consideringVertex & (consideringVertex - 1))); if (singleVertex && threadIdx.x % DeviceFunction.WarpSize == 0) { statusOfSynchronization[ac] = true; } } }
public static void Kernel( int[] precomputedStateTransitioningMatrixA, int[] precomputedStateTransitioningMatrixB, bool[] isSynchronizing, int[] shortestSynchronizingWordLength) { var n = problemSize.Value; var arrayCount = precomputedStateTransitioningMatrixA.Length / n; var power = 1 << n; const int bitSize = 6; int twoToBitsize = (1 << bitSize) - 1; var wordCount = (8 * sizeof(int) / bitSize); #region Pointer setup var byteOffset = 0; var queueEvenCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += sizeof(int); var readingQueueIndex = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += sizeof(int); var queueOddCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += sizeof(int); var gpuAB = DeviceFunction.AddressOfArray(__shared__.ExternArray <uint>()) .Ptr(byteOffset / sizeof(uint)) .Volatile(); byteOffset += (n + 1) * sizeof(uint); // must be the last among ints var isDiscoveredPtr = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); var complexOffset = (power * sizeof(int) + wordCount - 1) / wordCount; byteOffset += complexOffset + (((complexOffset % sizeof(int)) & 1) == 1 ? 1 : 0); var queueEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += (power / 2 + 1) * sizeof(ushort); var queueOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += (power / 2 + 1) * sizeof(ushort); var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); #endregion ushort nextDistance; ushort vertexAfterTransitionA, vertexAfterTransitionB; uint vertexAfterTransition; var acPart = (arrayCount + gridDim.x - 1) / gridDim.x; var acBegin = blockIdx.x * acPart; var acEnd = acBegin + acPart; if (arrayCount < acEnd) { acEnd = arrayCount; } var index = acBegin * n; var maskN = (1 << n) - 1; for (int ac = acBegin; ac < acEnd; ac++, index += n) { // cleanup for (int consideringVertex = threadIdx.x, endingVertex = (power - 1) / wordCount; consideringVertex < endingVertex; consideringVertex += blockDim.x) { isDiscoveredPtr[consideringVertex] = 0; } if (threadIdx.x < n) { gpuAB[threadIdx.x + 1] = (uint)( (1 << (n + precomputedStateTransitioningMatrixA[index + threadIdx.x])) | (1 << precomputedStateTransitioningMatrixB[index + threadIdx.x]) ); } else if (threadIdx.x == n) { gpuAB[0] = 0; readingQueueIndex[0] = 0; shouldStop[0] = false; queueEvenCount[0] = 0; queueOddCount[0] = 1; queueOdd[0] = (ushort)(power - 1); // assuming n >= 2 isDiscoveredPtr[(power - 1) / wordCount] = 1 << (((power - 1) % wordCount) * bitSize); } var readingQueue = queueOdd; var writingQueue = queueEven; var readingQueueCount = queueOddCount; var writingQueueIndex = queueEvenCount; nextDistance = 1; int readingQueueCountCached = 1; DeviceFunction.SyncThreads(); while (readingQueueCountCached > 0 && !shouldStop[0]) { //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac); while (readingQueueIndex[0] < readingQueueCountCached) { var iter = DeviceFunction.AtomicAdd(readingQueueIndex, 1); if (iter >= readingQueueCountCached) { break; } int consideringVertex = readingQueue[iter]; vertexAfterTransition = 0; for (int i = 1; i <= n; i++, consideringVertex >>= 1) { vertexAfterTransition |= gpuAB[i * (1 & consideringVertex)]; } vertexAfterTransitionA = (ushort)(vertexAfterTransition >> n); vertexAfterTransitionB = (ushort)(vertexAfterTransition & maskN); var isDiscoveredOffset = (vertexAfterTransitionA % wordCount) * bitSize; if (0 == (isDiscoveredPtr[vertexAfterTransitionA / wordCount] & (twoToBitsize << isDiscoveredOffset))) { // should have used AtomicOr (which is not available in AleaGPU - very unfortunate) var beforeAdded = DeviceFunction.AtomicAdd( isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount), 1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset); if (0 == beforeAdded) { if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)] = (ushort)vertexAfterTransitionA; } else { DeviceFunction.AtomicSub( isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount), 1 << isDiscoveredOffset); } } isDiscoveredOffset = (vertexAfterTransitionB % wordCount) * bitSize; if (0 == (isDiscoveredPtr[vertexAfterTransitionB / wordCount] & (twoToBitsize << isDiscoveredOffset))) { var beforeAdded = DeviceFunction.AtomicAdd( isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount), 1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset); if (0 == beforeAdded) { if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)] = (ushort)vertexAfterTransitionB; } else { DeviceFunction.AtomicSub( isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount), 1 << isDiscoveredOffset); } } } DeviceFunction.SyncThreads(); ++nextDistance; readingQueue = nextDistance % 2 == 0 ? queueEven : queueOdd; writingQueue = nextDistance % 2 != 0 ? queueEven : queueOdd; readingQueueCount = nextDistance % 2 == 0 ? queueEvenCount : queueOddCount; writingQueueIndex = nextDistance % 2 != 0 ? queueEvenCount : queueOddCount; readingQueueCountCached = nextDistance % 2 == 0 ? queueEvenCount[0] : queueOddCount[0]; if (threadIdx.x == 0) { writingQueueIndex[0] = 0; readingQueueIndex[0] = 0; } DeviceFunction.SyncThreads(); } } }
public static void Kernel( int[] precomputedStateTransitioningMatrixA, int[] precomputedStateTransitioningMatrixB, bool[] isSynchronizing, int[] shortestSynchronizingWordLength) { if (threadIdx.x % DeviceFunction.WarpSize != 0) { return; } var n = problemSize.Value; var arrayCount = precomputedStateTransitioningMatrixA.Length / n; var powerSetCount = 1 << n; var queueUpperBound = powerSetCount / 2 + 1; var initialVertex = (ushort)(powerSetCount - 1); var warpCount = blockDim.x / DeviceFunction.WarpSize; var threadOffset = threadIdx.x / DeviceFunction.WarpSize; #region Pointer setup var byteOffset = 0; var queueItself = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Ptr(threadOffset * queueUpperBound); byteOffset += sizeof(ushort) * queueUpperBound * warpCount; var gpuAs = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Ptr(threadOffset * n) .Volatile(); byteOffset += sizeof(ushort) * n * warpCount; var gpuBs = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Ptr(threadOffset * n) .Volatile(); byteOffset += sizeof(ushort) * n * warpCount; var isDiscovered = DeviceFunction.AddressOfArray(__shared__.ExternArray <byte>()) .Ptr(byteOffset / sizeof(byte)) .Ptr(threadOffset * powerSetCount); byteOffset += sizeof(byte) * powerSetCount * warpCount; #endregion int index; var acPart = (arrayCount + gridDim.x - 1) / gridDim.x; var acBegin = blockIdx.x * acPart; var acEnd = acBegin + acPart; if (arrayCount < acEnd) { acEnd = arrayCount; } index = acBegin * n; int queueReadingIndexModQueueSize = 0, queueWritingIndexModQueueSize = 0; int queueReadingIndexTotals = 0, queueWritingIndexTotals = 0; byte localCyclicProblemId = 1; for (int ac = acBegin; ac < acEnd; ac++, index += n, localCyclicProblemId++) { queueReadingIndexTotals = queueWritingIndexTotals = 0; queueReadingIndexModQueueSize = queueWritingIndexModQueueSize = 0; if (localCyclicProblemId == 0) { for (int i = 0; i < powerSetCount; i++) { isDiscovered[i] = 0; } localCyclicProblemId = 1; } for (int i = 0; i < n; i++) { gpuAs[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]); gpuBs[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]); } // the queue is surely at least 2 vertices long... // no need for modulo operations queueItself[queueWritingIndexModQueueSize++] = initialVertex; var discoveredSingleton = false; ushort consideringVertex; ushort vertexAfterTransitionA; ushort vertexAfterTransitionB; ushort firstSingletonDistance = 0; ushort currentNextDistance = 1; var verticesUntilBump = int.MaxValue; var seekingFirstNext = true; // there is something to read while (queueWritingIndexModQueueSize > queueReadingIndexModQueueSize || queueWritingIndexTotals > queueReadingIndexTotals) { consideringVertex = queueItself[queueReadingIndexModQueueSize++]; if (queueReadingIndexModQueueSize == queueUpperBound) { queueReadingIndexTotals++; queueReadingIndexModQueueSize = 0; } if (--verticesUntilBump == 0) { ++currentNextDistance; seekingFirstNext = true; } vertexAfterTransitionA = vertexAfterTransitionB = 0; for (int i = 0, mask = 1; i < n; i++, mask <<= 1) { if (0 != (mask & consideringVertex)) { vertexAfterTransitionA |= gpuAs[i]; vertexAfterTransitionB |= gpuBs[i]; } } if (localCyclicProblemId != isDiscovered[vertexAfterTransitionA]) { if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1))) { discoveredSingleton = true; firstSingletonDistance = currentNextDistance; break; } isDiscovered[vertexAfterTransitionA] = localCyclicProblemId; queueItself[queueWritingIndexModQueueSize++] = vertexAfterTransitionA; if (queueWritingIndexModQueueSize == queueUpperBound) { queueWritingIndexTotals++; queueWritingIndexModQueueSize = 0; } if (seekingFirstNext) { seekingFirstNext = false; verticesUntilBump = (queueWritingIndexTotals - queueReadingIndexTotals) * queueUpperBound + (queueWritingIndexModQueueSize - queueReadingIndexModQueueSize); } } if (localCyclicProblemId != isDiscovered[vertexAfterTransitionB]) { if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1))) { discoveredSingleton = true; firstSingletonDistance = currentNextDistance; break; } isDiscovered[vertexAfterTransitionB] = localCyclicProblemId; queueItself[queueWritingIndexModQueueSize++] = vertexAfterTransitionB; if (queueWritingIndexModQueueSize == queueUpperBound) { queueWritingIndexTotals++; queueWritingIndexModQueueSize = 0; } if (seekingFirstNext) { seekingFirstNext = false; verticesUntilBump = (queueWritingIndexTotals - queueReadingIndexTotals) * queueUpperBound + (queueWritingIndexModQueueSize - queueReadingIndexModQueueSize); } } } if (discoveredSingleton) { isSynchronizing[ac] = true; shortestSynchronizingWordLength[ac] = firstSingletonDistance; } } }
public static void Kernel( int[] precomputedStateTransitioningMatrixA, int[] precomputedStateTransitioningMatrixB, bool[] isSynchronizing, int[] shortestSynchronizingWordLength) { var n = problemSize.Value; var arrayCount = precomputedStateTransitioningMatrixA.Length / n; var power = 1 << n; #region Pointer setup var byteOffset = 0; var minEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var minOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var maxEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var maxOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>()) .Ptr(byteOffset / sizeof(int)); byteOffset += blockDim.x * sizeof(int); var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += n * sizeof(ushort); var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>()) .Ptr(byteOffset / sizeof(ushort)) .Volatile(); byteOffset += n * sizeof(ushort); var isActiveEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)); byteOffset += power * sizeof(bool); var isActiveOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)); byteOffset += power * sizeof(bool); var isDiscovered = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)); byteOffset += power * sizeof(bool); var addedAnythingOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); var addedAnythingEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>()) .Ptr(byteOffset / sizeof(bool)) .Volatile(); byteOffset += sizeof(bool); #endregion ushort nextDistance; int vertexAfterTransitionA, vertexAfterTransitionB, index; var acPart = (arrayCount + gridDim.x - 1) / gridDim.x; var acBegin = blockIdx.x * acPart; var acEnd = acBegin + acPart; if (arrayCount < acEnd) { acEnd = arrayCount; } index = acBegin * n; var threadWork = (power + blockDim.x - 1) / blockDim.x; DeviceFunction.SyncThreads(); for (int ac = acBegin; ac < acEnd; ac++, index += n) { //Console.WriteLine("Begin {0}", threadIdx.x); // cleanup var readingActive = isActiveOdd; var writingActive = isActiveEven; var readingAnythingAdded = addedAnythingOdd; var writingAnythingAdded = addedAnythingEven; var minRead = minOdd; var minWrite = minEven; var maxRead = maxOdd; var maxWrite = maxEven; if (threadIdx.x == DeviceFunction.WarpSize || (threadIdx.x == 0 && blockDim.x <= DeviceFunction.WarpSize)) { for (int i = 0; i < n; i++) { gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]); gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]); } } minRead[threadIdx.x] = int.MaxValue; maxRead[threadIdx.x] = 0; minWrite[threadIdx.x] = int.MaxValue; maxWrite[threadIdx.x] = 0; if (threadIdx.x == 0) { shouldStop[0] = false; readingAnythingAdded[0] = true; } else if (threadIdx.x == blockDim.x - 1) { minRead[blockDim.x - 1] = (power - 1) % threadWork; maxRead[blockDim.x - 1] = power % threadWork; } nextDistance = 1; for (int consideringVertex = threadIdx.x, endingVertex = power, powerm1 = power - 1; consideringVertex < endingVertex; consideringVertex += blockDim.x) { isActiveEven[consideringVertex] = false; isActiveOdd[consideringVertex] = consideringVertex == powerm1; isDiscovered[consideringVertex] = consideringVertex == powerm1; //Console.WriteLine("cleaning {0}, {1} {2} {3}", consideringVertex, // readingActive[consideringVertex], writingActive[consideringVertex], isDiscovered[consideringVertex]); } DeviceFunction.SyncThreads(); while (readingAnythingAdded[0] && !shouldStop[0]) { if (threadIdx.x == 0) { readingAnythingAdded[0] = false; writingAnythingAdded[0] = false; //Console.WriteLine("distance {0}", nextDistance); } int myPart = (power + blockDim.x - 1) / blockDim.x; int beginningPointer = threadIdx.x * myPart; int endingPointer = beginningPointer + myPart; if (power < endingPointer) { endingPointer = power; } if (minRead[threadIdx.x] > beginningPointer) { beginningPointer = minRead[threadIdx.x]; } //if (maxRead[threadIdx.x] < endingPointer) // endingPointer = maxRead[threadIdx.x]; minRead[threadIdx.x] = int.MaxValue; maxRead[threadIdx.x] = 0; minWrite[threadIdx.x] = int.MaxValue; maxWrite[threadIdx.x] = 0; DeviceFunction.SyncThreads(); //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac); for (int consideringVertex = beginningPointer; consideringVertex < endingPointer; ++consideringVertex) { //Console.WriteLine("writeIsActive[{0}]=={1}", consideringVertex, writingActive[consideringVertex]); if (!readingActive[consideringVertex]) { //Console.WriteLine("Skipping {0}", consideringVertex); continue; } else { //Console.WriteLine("Considering {0} dist {1}", consideringVertex, nextDistance); readingActive[consideringVertex] = false; } vertexAfterTransitionA = vertexAfterTransitionB = 0; for (int i = 0, mask = 1; i < n; i++, mask <<= 1) { if (0 != (mask & consideringVertex)) { vertexAfterTransitionA |= gpuA[i]; vertexAfterTransitionB |= gpuB[i]; } } if (!isDiscovered[vertexAfterTransitionA]) { isDiscovered[vertexAfterTransitionA] = true; //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionA, consideringVertex); DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionA / threadWork), vertexAfterTransitionA % threadWork); DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionA / threadWork), 1 + (vertexAfterTransitionA % threadWork)); if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingActive[vertexAfterTransitionA] = true; writingAnythingAdded[0] = true; } if (!isDiscovered[vertexAfterTransitionB]) { isDiscovered[vertexAfterTransitionB] = true; //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionB, consideringVertex); DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionB / threadWork), vertexAfterTransitionB % threadWork); DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionB / threadWork), 1 + (vertexAfterTransitionB % threadWork)); if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1))) { shortestSynchronizingWordLength[ac] = nextDistance; isSynchronizing[ac] = true; shouldStop[0] = true; break; } writingActive[vertexAfterTransitionB] = true; writingAnythingAdded[0] = true; } } ++nextDistance; readingActive = nextDistance % 2 == 0 ? isActiveEven : isActiveOdd; writingActive = nextDistance % 2 != 0 ? isActiveEven : isActiveOdd; readingAnythingAdded = nextDistance % 2 == 0 ? addedAnythingEven : addedAnythingOdd; writingAnythingAdded = nextDistance % 2 != 0 ? addedAnythingEven : addedAnythingOdd; minRead = nextDistance % 2 == 0 ? minEven : minOdd; minWrite = nextDistance % 2 != 0 ? minEven : minOdd; maxRead = nextDistance % 2 == 0 ? maxEven : maxOdd; maxWrite = nextDistance % 2 != 0 ? maxEven : maxOdd; DeviceFunction.SyncThreads(); } } }