// The 2-D matrices are first flattened ("PACK"ed) into 1-D arrays, which is much
// more efficient; the column counts of A, B and C are computed up front and
// passed in directly. Computes C = A * B with shared-memory tiling.
private static void KernelPacked(double[] a, double[] b, double[] c, int colsA, int colsB, int colsC)
{
    // Position of the current block within the grid.
    var blockRow = blockIdx.x;
    var blockCol = blockIdx.y;
    var valueC = 0.0;
    // Position of the current thread within the block.
    var row = threadIdx.x;
    var col = threadIdx.y;
    // DivUp rounds up, so a partially filled edge tile still gets processed.
    for (var m = 0; m < DivUp(colsA, BlockSize); ++m)
    {
        // Tile buffers in shared memory, refilled every iteration.
        var subA = __shared__.Array2D <double>(BlockSize, BlockSize);
        var subB = __shared__.Array2D <double>(BlockSize, BlockSize);
        subA[row, col] = GetMatrixElement(colsA, a, blockRow, m, row, col);
        subB[row, col] = GetMatrixElement(colsB, b, m, blockCol, row, col);
        // Synchronize: wait until every thread has loaded its tile element.
        DeviceFunction.SyncThreads();
        // Accumulate the dot product of this tile's row of A and column of B.
        for (var e = 0; e < BlockSize; ++e)
        {
            valueC += subA[row, e] * subB[e, col];
            //valueC = col;
        }
        // Synchronize before the next iteration overwrites the shared tiles.
        DeviceFunction.SyncThreads();
    }
    SetMatrixElement(colsC, c, blockRow, blockCol, row, col, valueC);
}
// Tiled matrix multiply expressed through element accessors: for each output
// cell (r, c), folds mul(A[r, k], B[k, c]) over k with 'add', starting from
// 'zero', and hands the result to 'setC'. One thread per output cell of the
// block's tile.
private static void Kernel(long colsA, Func <long, long, T> getA, Func <long, long, T> getB, Action <long, long, T> setC, T zero, Func <T, T, T> add, Func <T, T, T> mul)
{
    var tileRow = blockIdx.x;
    var tileCol = blockIdx.y;
    var r = threadIdx.x;
    var c = threadIdx.y;
    var acc = zero;
    for (var t = 0; t < ScalarOps.DivUp(colsA, BlockSize); ++t)
    {
        // Stage one tile of A and one tile of B in shared memory.
        var tileA = __shared__.Array2D <T>(BlockSize, BlockSize);
        var tileB = __shared__.Array2D <T>(BlockSize, BlockSize);
        tileA[r, c] = getA(tileRow * BlockSize + r, t * BlockSize + c);
        tileB[r, c] = getB(t * BlockSize + r, tileCol * BlockSize + c);
        // Wait until both tiles are fully staged before computing.
        DeviceFunction.SyncThreads();
        for (var k = 0; k < BlockSize; ++k)
        {
            acc = add(acc, mul(tileA[r, k], tileB[k, c]));
        }
        // Wait before the next iteration reloads the shared tiles.
        DeviceFunction.SyncThreads();
    }
    setC(tileRow * BlockSize + r, tileCol * BlockSize + c, acc);
}
// Accumulates the per-filter bias gradient: adds into biasUpdates[filter] the
// sum of delta over every batch item and spatial position of that filter.
// One block per filter; the block's threads stride across the spatial size,
// then thread 0 folds the per-thread partials.
private static void backward_bias_kernel(float[] biasUpdates, float[] delta, int batch, int n, int size)
{
    var partial = __shared__.Array <float>(CudaUtils.BlockSize);
    var filter = blockIdx.x;
    var lane = threadIdx.x;
    var acc = 0f;
    for (var b = 0; b < batch; ++b)
    {
        var baseIndex = size * (filter + n * b);
        for (var offset = 0; offset < size; offset += CudaUtils.BlockSize)
        {
            var pos = lane + offset;
            if (pos < size)
            {
                acc += delta[pos + baseIndex];
            }
        }
    }
    partial[lane] = acc;
    DeviceFunction.SyncThreads();
    // Thread 0 serially combines the block's partial sums into the output.
    if (lane == 0)
    {
        for (var i = 0; i < CudaUtils.BlockSize; ++i)
        {
            biasUpdates[filter] += partial[i];
        }
    }
}
// One simulation tick for a single fish (one thread per fish): boid steering
// rules (align, cohere, avoid), optional mouse avoidance, motion integration,
// edge handling, then rendering the fish into the shared bitmap.
public static void Kernel(double[] positionsx, double[] positionsy, double[] velocitiesx, double[] velocitiesy, double[] accelerationsx, double[] accelerationsy, int width, int height, float mousex, float mousey, int[,] squareFish, int[] squaresStart, int[] fishInSquare, int squaresInRow, int squaresNumber, int[] bitmap)
{
    var fish = blockIdx.x * blockDim.x + threadIdx.x;

    // Steering: match neighbours' heading, move towards them, keep distance.
    FishFunctions.AlignWithOtherFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, fish, squareFish, squaresStart, fishInSquare, squaresInRow, squaresNumber);
    FishFunctions.CohesionWithOtherFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, fish, squaresStart, squareFish, fishInSquare, squaresNumber, squaresInRow);
    FishFunctions.AvoidOtherFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, fish, squaresStart, squareFish, fishInSquare, squaresInRow, squaresNumber);

    // A negative mouse coordinate encodes "no mouse to avoid".
    var mouseOnScreen = mousex >= 0 && mousey >= 0;
    if (mouseOnScreen)
    {
        FishFunctions.AvoidMouse(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, fish, mousex, mousey);
    }

    // Integrate velocity/position, then handle the window borders.
    FishFunctions.UpdateFish(positionsx, positionsy, velocitiesx, velocitiesy, accelerationsx, accelerationsy, fish, width, height, mousex, mousey);
    FishFunctions.Edges(positionsx, positionsy, fish, width, height);

    // Every fish in the block must be fully updated before anyone draws.
    DeviceFunction.SyncThreads();

    // Opaque magenta in ARGB; same bit pattern as the original additive form.
    var argb = (255 << 24) | (255 << 8) | 255;
    var px = (int)positionsx[fish];
    var py = (int)positionsy[fish];
    CircleDrawing.CircleBresenham(px, py, 2, bitmap, width, height, argb);
}
// Midpoint-rule quadrature of CalkaGPU over an n x n grid with spacing d.
// Threads grid-stride over rows (y), each integrating its rows completely;
// per-thread partials are staged in shared memory and thread 0 of each block
// folds them into result[blockIdx.x].
// NOTE(review): lb and ub are unused in the body; the domain is implied by n
// and d — confirm against the host-side caller.
public static void Kernel(double[] result, int n, double lb, double ub, double d)
{
    var temp = __shared__.Array <double>(BlockSize);
    var start = blockIdx.x * blockDim.x + threadIdx.x;
    var step = gridDim.x * blockDim.x;
    // Accumulate in a register first. The previous version did
    // 'temp[threadIdx.x] += ...' on never-initialized shared memory, and
    // threads with start >= n never wrote their slot at all, so the final
    // reduction read garbage.
    var acc = 0.0;
    for (int yi = start; yi < n; yi += step)
    {
        double y = (yi + 0.5) * d;
        double intSum = 0;
        for (int xi = 0; xi < n; xi++)
        {
            double x = (xi + 0.5) * d;
            intSum += CalkaGPU(x, y);
        }
        acc += intSum * d * d;
    }
    // Unconditional write: every one of the BlockSize slots is now defined
    // (idle threads contribute 0) before the reduction reads them.
    temp[threadIdx.x] = acc;
    DeviceFunction.SyncThreads();
    if (threadIdx.x == 0)
    {
        for (int i = 0; i < BlockSize; i++)
        {
            result[blockIdx.x] += temp[i];
        }
    }
}
// ReSharper disable once SuggestBaseTypeForParameter
// Block-level reduction with operator 'op'; each block writes its partial
// result to result[blockIdx.x]. Each thread combines two elements (gid and
// gid + blockDim.x) already during the load so no thread sits idle in the
// first step, then a sequential-addressing shared-memory reduction folds the
// block down to shared[0].
// NOTE(review): when gid >= length the else branch still reads array[gid] out
// of bounds — presumably the launch covers the input exactly; confirm at the
// call site.
private static void KernelSequentialReduceIdleThreads <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
{
    var shared = __shared__.ExternArray <T>();
    var tid = threadIdx.x;
    var bid = blockIdx.x;
    var gid = 2 * blockDim.x * bid + tid;
    // First reduction step happens on load: combine a pair when both elements
    // are in range, otherwise fall back to the single element.
    shared[tid] = (gid < length && gid + blockDim.x < length) ? op(array[gid], array[gid + blockDim.x]) : array[gid];
    DeviceFunction.SyncThreads();
    // Sequential addressing: the active threads form a contiguous prefix and
    // the stride halves each step.
    for (int s = blockDim.x / 2; s > 0; s >>= 1)
    {
        if (tid < s && gid + s < length)
        {
            shared[tid] = op(shared[tid], shared[tid + s]);
        }
        DeviceFunction.SyncThreads();
    }
    // Thread 0 publishes the block's result.
    if (tid == 0)
    {
        result[bid] = shared[0];
    }
}
// ReSharper disable once SuggestBaseTypeForParameter
// Block-level reduction using interleaved addressing: at step s, threads whose
// tid is a multiple of 2*s combine shared[tid] with shared[tid + s]. Each
// block writes its partial result to result[blockIdx.x].
// NOTE(review): when gid >= length, shared[tid] is never written yet may be
// read by a neighbouring thread inside the loop — presumably the launch
// covers the input exactly; confirm at the call site.
private static void KernelInterleavedAccess <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
{
    var shared = __shared__.ExternArray <T>();
    var tid = threadIdx.x;
    var bid = blockIdx.x;
    var gid = blockDim.x * bid + tid;
    if (gid < length)
    {
        shared[tid] = array[gid];
    }
    DeviceFunction.SyncThreads();
    // Interleaved addressing: the modulo test diverges within warps, which is
    // why this is the slower classic variant.
    for (var s = 1; s < blockDim.x; s *= 2)
    {
        if (tid % (2 * s) == 0 && gid + s < length)
        {
            shared[tid] = op(shared[tid], shared[tid + s]);
        }
        DeviceFunction.SyncThreads();
    }
    // Thread 0 publishes the block's result.
    if (tid == 0)
    {
        result[bid] = shared[0];
    }
}
// Computes one blockDim.x-by-blockDim.x tile of the pairwise squared-distance
// matrix between the "I" points (selected by blockIdx.y) and the "J" points
// (selected by blockIdx.x); results are written as Real2 pairs.
private static void AleaKernelConstants(
    deviceptr <Real> mSquaredDistances,
    deviceptr <Real> mCoordinates,
    Constant <int> c,
    int n,
    int pitch)
{
    // Same as CudaKernelOptimised2, but the number of coordinates is given as a meta-constant.
    // Also, we write the results as float2.

    // Shared layout: c.Value * blockDim.x Reals for the I tile, then the same
    // amount for the J tile (coordinate-major).
    var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
    var coordinatesI = shared.Ptr(0);
    var coordinatesJ = shared.Ptr(c.Value * blockDim.x);

    var bI = blockIdx.y * blockDim.x;
    var bJ = blockIdx.x * blockDim.x;

    // Stage both tiles into shared memory (bounds-checked at the edges).
    for (int k = 0; k != c.Value; ++k)
    {
        if (bI + threadIdx.x < n)
        {
            coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
        }
        if (bJ + threadIdx.x < n)
        {
            coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
        }
    }
    DeviceFunction.SyncThreads();

    // Split the block in two halves: 'line' selects even or odd rows i, and
    // each thread owns one adjacent pair of J columns addressed as Real2.
    var line = threadIdx.x / (blockDim.x / 2);
    var tid = threadIdx.x % (blockDim.x / 2);

    if (bJ + tid * 2 < n)
    {
        var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();
        for (int i = line; i < blockDim.x && bI + i < n; i += 2)
        {
            var dist = default(Real2);
            for (int k = 0; k != c.Value; ++k)
            {
                var coord1 = coordinatesI[k * blockDim.x + i];
                var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y);
                dist.x += diff.x * diff.x;
                dist.y += diff.y * diff.y;
            }
            // Both distances leave in a single Real2 store.
            var dst = mSquaredDistances.Ptr((bI + i) * pitch + bJ).Reinterpret <Real2>();
            dst[tid] = dist;
        }
    }
}
// Same squared-distance tiling as AleaKernelSharedMemory, but each thread
// produces two output columns per row by reading the J coordinates as Real2
// pairs; here c is an ordinary runtime parameter.
private static void AleaKernelFloat2(
    deviceptr <Real> mSquaredDistances,
    deviceptr <Real> mCoordinates,
    int c,
    int n,
    int pitch)
{
    // Same as KernelSharedMemory, but one thread does two element in one by using float2 reads.

    // Shared layout: c * blockDim.x Reals for the I tile, then the J tile.
    var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
    var coordinatesI = shared.Ptr(0);
    var coordinatesJ = shared.Ptr(c * blockDim.x);

    var bI = blockIdx.y * blockDim.x;
    var bJ = blockIdx.x * blockDim.x;

    // Stage both tiles (coordinate-major) into shared memory.
    for (int k = 0; k != c; ++k)
    {
        if (bI + threadIdx.x < n)
        {
            coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
        }
        if (bJ + threadIdx.x < n)
        {
            coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
        }
    }
    DeviceFunction.SyncThreads();

    // Half the threads take even rows, half odd rows; each thread owns the
    // adjacent J column pair (2 * tid, 2 * tid + 1).
    var line = threadIdx.x / (blockDim.x / 2);
    var tid = threadIdx.x % (blockDim.x / 2);

    if (bJ + tid * 2 < n)
    {
        var coordinatesJ2 = coordinatesJ.Reinterpret <Real2>();
        for (int i = line; i < blockDim.x && bI + i < n; i += 2)
        {
            Real dist0 = 0;
            Real dist1 = 0;
            for (int k = 0; k != c; ++k)
            {
                var coord1 = coordinatesI[k * blockDim.x + i];
                var coord2 = coordinatesJ2[(k * blockDim.x / 2) + tid];
                var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y);
                dist0 += diff.x * diff.x;
                dist1 += diff.y * diff.y;
            }
            // Two scalar stores (unlike AleaKernelConstants' single Real2 store).
            mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 0)] = dist0;
            mSquaredDistances[(bI + i) * pitch + (bJ + 2 * tid + 1)] = dist1;
        }
    }
}
// Same tiling as AleaKernelConstants, but each thread keeps its own J-point
// coordinate pair in local memory; only the I tile occupies shared memory,
// which leaves room for larger tiles.
private static void AleaKernelLocalMemory(
    deviceptr <Real> mSquaredDistances,
    deviceptr <Real> mCoordinates,
    Constant <int> dimX,
    Constant <int> c,
    int n,
    int pitch)
{
    // Same as KernelConstants, but use both local and shared memory to increase the effective shared memory.

    var coordinatesI = __shared__.Array <Real>(c.Value * dimX.Value);
    // Per-thread storage: one Real2 (this thread's J pair) per coordinate.
    var coordinatesJ = __local__.Array <Real2>(c.Value);

    var bI = blockIdx.y * dimX.Value;
    var bJ = blockIdx.x * dimX.Value;
    // 'line' selects even/odd rows; 'tid' selects the thread's J column pair.
    var line = threadIdx.x / (dimX.Value / 2);
    var tid = threadIdx.x % (dimX.Value / 2);
    var isActive = bJ + tid * 2 < n;

    for (int k = 0; k != c.Value; ++k)
    {
        if (bI + threadIdx.x < n)
        {
            coordinatesI[k * dimX.Value + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
        }
        if (isActive)
        {
            // One vectorized Real2 read loads this thread's J pair.
            var mCoordinates2 = mCoordinates.Reinterpret <Real2>();
            coordinatesJ[k] = mCoordinates2[(k * n + bJ) / 2 + tid];
        }
    }
    DeviceFunction.SyncThreads();

    if (isActive)
    {
        for (int i = line; i < dimX.Value && bI + i < n; i += 2)
        {
            var dist = default(Real2);
            for (int k = 0; k != c.Value; ++k)
            {
                var coord1 = coordinatesI[k * dimX.Value + i];
                var coord2 = coordinatesJ[k];
                var diff = new Real2(coord1 - coord2.x, coord1 - coord2.y);
                dist.x += diff.x * diff.x;
                dist.y += diff.y * diff.y;
            }
            // Both distances leave in a single Real2 store.
            var dst = mSquaredDistances.Reinterpret <Real2>();
            dst[((bI + i) * pitch + bJ) / 2 + tid] = dist;
        }
    }
}
// ReSharper disable once SuggestBaseTypeForParameter
// Block reduction like KernelSequentialReduceIdleThreads, but the final
// warp's worth of work is folded with warp shuffles instead of the barrier
// loop.
// NOTE(review): the shuffle phase reads shared[tid] values written by other
// lanes without an intervening barrier, relying on implicit warp-synchronous
// execution — confirm this holds on the target architecture/scheduler.
private static void KernelSequentialReduceIdleThreadsWarp <T>(deviceptr <T> array, int length, T[] result, Func <T, T, T> op)
{
    var shared = __shared__.ExternArray <T>();
    var tid = threadIdx.x;
    var bid = blockIdx.x;
    var bdm = blockDim.x;
    var gid = 2 * bdm * bid + tid;
    // First combine happens on load; falls back to a single element at the
    // tail. NOTE(review): when gid >= length this still reads array[gid] out
    // of bounds — presumably the launch covers the input exactly.
    shared[tid] = (gid < length && gid + bdm < length) ? op(array[gid], array[gid + bdm]) : array[gid];
    DeviceFunction.SyncThreads();
    // Shared-memory reduction stops once only warp-sized work remains.
    for (var s = bdm / 2; s > WarpSize; s >>= 1)
    {
        if (tid < s && gid + s < length)
        {
            shared[tid] = op(shared[tid], shared[tid + s]);
        }
        DeviceFunction.SyncThreads();
    }
    if (tid < WarpSize)
    {
        // Fold in the second warp's worth of values, then shuffle down by
        // 16, 8, 4, 2, 1 so lane 0 accumulates the block total.
        if (bdm >= 2 * WarpSize)
        {
            shared[tid] = op(shared[tid], shared[tid + WarpSize]);
        }
        shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 16));
        shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 8));
        shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 4));
        shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 2));
        shared[tid] = op(shared[tid], DeviceFunction.ShuffleDown(shared[tid], 1));
    }
    if (tid == 0)
    {
        result[bid] = shared[0];
    }
}
// ReSharper disable once SuggestBaseTypeForParameter
// Grid-stride reduction: each thread accumulates a private partial over the
// input, warp shuffles fold it within each warp, each warp leader stashes its
// value in shared memory, and thread 0 combines the eight warp results into
// result[bid].
// NOTE(review): the fixed 8-slot shared array and the unconditional 8-way
// combine assume blockDim.x == 8 * WarpSize and that every slot gets written —
// confirm the launch configuration.
private static void KernelSequentialReduceIdleThreadsWarpMultiple <T>(deviceptr <T> array, int length, deviceptr <T> result, Func <T, T, T> op)
{
    var tid = threadIdx.x;
    var bid = blockIdx.x;
    var bdm = blockDim.x;
    var gid = bdm * bid + tid;

    // Todo: 'default(T)' is a bad idea, think of (n * 0) => The accumulator's initial value should be provided by the user!
    var accumulator = default(T);

    // Grid-stride loop: correctness does not depend on the launch size.
    while (gid < length)
    {
        accumulator = op(accumulator, array[gid]);
        gid += gridDim.x * bdm;
    }

    // Warp fold: shuffling down by 16, 8, 4, 2, 1 leaves the warp total in lane 0.
    accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 16));
    accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 8));
    accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 4));
    accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 2));
    accumulator = op(accumulator, DeviceFunction.ShuffleDown(accumulator, 1));

    // One slot per warp; only warp leaders (lane 0) write.
    var shared = __shared__.Array <T>(8);
    if (tid % WarpSize == 0)
    {
        shared[tid / WarpSize] = accumulator;
    }
    DeviceFunction.SyncThreads();

    // Thread 0 pairs up the eight warp totals into the block result.
    if (tid == 0)
    {
        var a = op(op(shared[0], shared[1]), op(shared[2], shared[3]));
        var b = op(op(shared[4], shared[5]), op(shared[6], shared[7]));
        result[bid] = op(a, b);
    }
}
// Odd-even transposition sort of 'array' on the GPU; returns the sorted copy.
// Each of the ceil(n/2) passes runs an even phase (compare-and-swap pairs
// starting at even i) followed by an odd phase (pairs starting at odd i),
// giving the n total phases the algorithm needs.
// Fixes over the previous version: the two parity branches used to perform
// the identical exchange (together they were just "if c > n swap"), so both
// phases raced in the same step; and SyncThreads sat inside the divergent
// 'i < inputLength - 1' branch, a barrier not reached by the last thread.
// NOTE(review): DeviceFunction.SyncThreads synchronizes only within a block;
// for inputs larger than one block the phases are still not globally
// synchronized — confirm the Gpu.For launch shape before relying on this.
internal static int[] Compute1(int[] array) /*where T : IComparable<T>*/
{
    var steps = array.Length % 2 == 0 ? array.Length / 2 : array.Length / 2 + 1;
    var gpu = Gpu.Default;
    var inputLength = array.Length;
    var inputMemory = gpu.Allocate(array);
    gpu.For(0, array.Length, i =>
    {
        for (var k = 0; k < steps; k++)
        {
            // Even phase: only even-indexed threads compare (i, i + 1).
            if (i % 2 == 0 && i < inputLength - 1 && inputMemory[i] > inputMemory[i + 1])
            {
                Exchange(inputMemory, i, i + 1);
            }
            DeviceFunction.SyncThreads();
            // Odd phase: only odd-indexed threads compare (i, i + 1).
            if (i % 2 != 0 && i < inputLength - 1 && inputMemory[i] > inputMemory[i + 1])
            {
                Exchange(inputMemory, i, i + 1);
            }
            DeviceFunction.SyncThreads();
        }
    });
    return(Gpu.CopyToHost(inputMemory));
}
// Kernel: takes matrices a and b and writes the product into matrix c.
// Variant where inputs and output are passed together as 2-D arrays.
private static void Kernel(double[,] a, double[,] b, double[,] c)
{
    var colsA = a.GetLength(1); // colsA is the number of columns of matrix A
    var blockRow = blockIdx.x;  // block row within the 2-D grid
    var blockCol = blockIdx.y;  // block column within the 2-D grid
    var valueC = 0.0;
    var row = threadIdx.x;      // thread row within the 2-D block
    var col = threadIdx.y;      // thread column within the 2-D block
    // DivUp rounds up (ceil). E.g. if matrix A has 33 columns and there are
    // 32 threads, we must allocate one extra block for the remainder, hence
    // the rounding up.
    for (var m = 0; m < DivUp(colsA, BlockSize); ++m)
    {
        // Build two 2-D tile buffers in shared memory.
        var subA = __shared__.Array2D <double>(BlockSize, BlockSize);
        var subB = __shared__.Array2D <double>(BlockSize, BlockSize);
        // Fill both tiles.
        subA[row, col] = GetMatrixElement(a, blockRow, m, row, col);
        subB[row, col] = GetMatrixElement(b, m, blockCol, row, col);
        // Synchronize: wait until every thread has loaded its data.
        DeviceFunction.SyncThreads();
        for (var e = 0; e < BlockSize; ++e)
        {
            // Accumulate this thread's partial dot product.
            valueC += subA[row, e] * subB[e, col];
            //valueC = row;
        }
        // Synchronize before the tiles are overwritten next iteration.
        DeviceFunction.SyncThreads();
    }
    // Store the accumulated value at the corresponding row and column.
    SetMatrixElement(c, blockRow, blockCol, row, col, valueC);
}
// Baseline shared-memory tiling for pairwise squared Euclidean distances:
// one thread per J column, one scalar output per (i, j) cell.
private static void AleaKernelSharedMemory(
    deviceptr <Real> mSquaredDistances,
    deviceptr <Real> mCoordinates,
    int c,
    int n,
    int pitch)
{
    // We've got shared memory of two vector of K dimensions for B points:
    //
    //      var coordI = __shared__ new Real[k*blockDim.x];
    //      var coordJ = __shared__ new Real[k*blockDim.x];
    //
    // We fill in these two vectors with the coordinates of the I points and J points.
    // Afterwards, the current block will compute the euclidean distances between all
    // the I points and J points, producing a square matrix [B, B].
    //
    // This optimisation means that when producing the square matrix, the I and J points
    // coordinates are only read once.
    //
    // This optimisation works well if K is small enough. Otherwise the shared memory is
    // too small and not enough blocks get schedule per SM.

    var shared = DeviceFunction.AddressOfArray(__shared__.ExternArray <Real>());
    var coordinatesI = shared.Ptr(0);
    var coordinatesJ = shared.Ptr(c * blockDim.x);

    var bI = blockIdx.y * blockDim.x;
    var bJ = blockIdx.x * blockDim.x;

    // Stage both tiles (coordinate-major) into shared memory, bounds-checked.
    for (int k = 0; k != c; ++k)
    {
        if (bI + threadIdx.x < n)
        {
            coordinatesI[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bI + threadIdx.x];
        }
        if (bJ + threadIdx.x < n)
        {
            coordinatesJ[k * blockDim.x + threadIdx.x] = mCoordinates[k * n + bJ + threadIdx.x];
        }
    }
    DeviceFunction.SyncThreads();

    // Each thread owns one J column and walks all I rows of the tile.
    if (bJ + threadIdx.x < n)
    {
        for (int i = 0; i < blockDim.x && bI + i < n; ++i)
        {
            Real dist = 0;
            for (int k = 0; k != c; ++k)
            {
                var coord1 = coordinatesI[k * blockDim.x + i];          //mCoordinates[k * x + i];
                var coord2 = coordinatesJ[k * blockDim.x + threadIdx.x]; //mCoordinates[k * x + j];
                var diff = coord1 - coord2;
                dist += diff * diff;
            }
            mSquaredDistances[(bI + i) * pitch + (bJ + threadIdx.x)] = dist;
        }
    }
}
// For each automaton, every thread applies one candidate 9-letter word
// (spelled by its thread-index bits: bit value 0 selects letter A, 1 selects
// letter B) to the full state set, represented as a bit mask. If any thread
// in a warp collapses the set to at most one state, the warp leader marks
// statusOfSynchronization[ac] true.
// the status might be YES, NO and DUNNO (aleaGPU enum???)
// TODO: change this Kernel and computation!
public static void Kernel(
    int[] precomputedStateTransitioningMatrixA,
    int[] precomputedStateTransitioningMatrixB,
    bool[] statusOfSynchronization)
{
    var n = problemSize.Value;
    var arrayCount = precomputedStateTransitioningMatrixA.Length / n;
    var power = 1 << n;

    #region Pointer setup
    var byteOffset = 0;

    // Per-state one-hot transition masks for letters A and B, in shared memory.
    var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
        .Ptr(byteOffset / sizeof(ushort))
        .Volatile();
    byteOffset += n * sizeof(ushort);

    var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
        .Ptr(byteOffset / sizeof(ushort))
        .Volatile();
    byteOffset += n * sizeof(ushort);
    #endregion

    // Distribute the automata evenly across blocks.
    var acPart = (arrayCount + gridDim.x - 1) / gridDim.x;
    var acBegin = blockIdx.x * acPart;
    var acEnd = acBegin + acPart;
    if (arrayCount < acEnd)
    {
        acEnd = arrayCount;
    }
    var index = acBegin * n;
    for (int ac = acBegin; ac < acEnd; ac++, index += n)
    {
        // Barrier so no thread still reads the previous automaton's tables.
        DeviceFunction.SyncThreads();
        // Thread 0 loads this automaton's transitions as one-hot bit masks.
        if (threadIdx.x == 0)
        {
            for (int i = 0; i < n; i++)
            {
                gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]);
                gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]);
            }
        }
        // This thread's candidate word, consumed one bit per letter.
        var pathMask = threadIdx.x;
        int vertexAfterTransition;
        // Start from the full state set (all n bits set).
        var consideringVertex = power - 1;
        DeviceFunction.SyncThreads();
        for (int iter = 0; iter < 9; iter++, pathMask >>= 1)
        {
            vertexAfterTransition = 0;
            // Apply letter A or B to every state present in the current set.
            if ((pathMask & 1) == 0)
            {
                for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                {
                    if (0 != (mask & consideringVertex))
                    {
                        vertexAfterTransition |= gpuA[i];
                    }
                }
            }
            else
            {
                for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                {
                    if (0 != (mask & consideringVertex))
                    {
                        vertexAfterTransition |= gpuB[i];
                    }
                }
            }
            consideringVertex = vertexAfterTransition;
        }
        // x & (x - 1) == 0 <=> at most one bit set: the set collapsed.
        var singleVertex = DeviceFunction.Any(0 == (consideringVertex & (consideringVertex - 1)));
        // One writer per warp is enough for the boolean flag.
        if (singleVertex && threadIdx.x % DeviceFunction.WarpSize == 0)
        {
            statusOfSynchronization[ac] = true;
        }
    }
}
// BFS over the power-set automaton to find the shortest synchronizing word.
// State sets are bit masks; the BFS frontier alternates between two
// shared-memory queues (even/odd distance). isDiscovered packs a bitSize-bit
// visit counter per vertex inside int words (AtomicAdd/AtomicSub emulate the
// missing AtomicOr). When a singleton set is reached, the distance is stored
// in shortestSynchronizingWordLength[ac] and isSynchronizing[ac] is set.
public static void Kernel(
    int[] precomputedStateTransitioningMatrixA,
    int[] precomputedStateTransitioningMatrixB,
    bool[] isSynchronizing,
    int[] shortestSynchronizingWordLength)
{
    var n = problemSize.Value;
    var arrayCount = precomputedStateTransitioningMatrixA.Length / n;
    var power = 1 << n;
    // Each vertex's visited counter occupies bitSize bits of an int word.
    const int bitSize = 6;
    int twoToBitsize = (1 << bitSize) - 1;
    var wordCount = (8 * sizeof(int) / bitSize);

    #region Pointer setup
    var byteOffset = 0;

    var queueEvenCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    byteOffset += sizeof(int);

    var readingQueueIndex = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    byteOffset += sizeof(int);

    var queueOddCount = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    byteOffset += sizeof(int);

    // Combined transition table: entry i+1 packs letter A's one-hot target in
    // the high n bits and letter B's in the low n bits; entry 0 stays zero so
    // absent states contribute nothing.
    var gpuAB = DeviceFunction.AddressOfArray(__shared__.ExternArray <uint>())
        .Ptr(byteOffset / sizeof(uint))
        .Volatile();
    byteOffset += (n + 1) * sizeof(uint);

    // must be the last among ints
    var isDiscoveredPtr = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    var complexOffset = (power * sizeof(int) + wordCount - 1) / wordCount;
    byteOffset += complexOffset + (((complexOffset % sizeof(int)) & 1) == 1 ? 1 : 0);

    var queueEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
        .Ptr(byteOffset / sizeof(ushort))
        .Volatile();
    byteOffset += (power / 2 + 1) * sizeof(ushort);

    var queueOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
        .Ptr(byteOffset / sizeof(ushort))
        .Volatile();
    byteOffset += (power / 2 + 1) * sizeof(ushort);

    var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
        .Ptr(byteOffset / sizeof(bool))
        .Volatile();
    byteOffset += sizeof(bool);
    #endregion

    ushort nextDistance;
    ushort vertexAfterTransitionA, vertexAfterTransitionB;
    uint vertexAfterTransition;
    // Distribute the automata evenly across blocks.
    var acPart = (arrayCount + gridDim.x - 1) / gridDim.x;
    var acBegin = blockIdx.x * acPart;
    var acEnd = acBegin + acPart;
    if (arrayCount < acEnd)
    {
        acEnd = arrayCount;
    }
    var index = acBegin * n;
    var maskN = (1 << n) - 1;
    for (int ac = acBegin; ac < acEnd; ac++, index += n)
    {
        // cleanup: reset the packed visited counters cooperatively.
        for (int consideringVertex = threadIdx.x, endingVertex = (power - 1) / wordCount;
             consideringVertex < endingVertex;
             consideringVertex += blockDim.x)
        {
            isDiscoveredPtr[consideringVertex] = 0;
        }
        // Threads 0..n-1 load the combined transition table; thread n resets
        // the queues and seeds the BFS with the full state set.
        if (threadIdx.x < n)
        {
            gpuAB[threadIdx.x + 1] = (uint)(
                (1 << (n + precomputedStateTransitioningMatrixA[index + threadIdx.x]))
                | (1 << precomputedStateTransitioningMatrixB[index + threadIdx.x])
                );
        }
        else if (threadIdx.x == n)
        {
            gpuAB[0] = 0;
            readingQueueIndex[0] = 0;
            shouldStop[0] = false;
            queueEvenCount[0] = 0;
            queueOddCount[0] = 1;
            queueOdd[0] = (ushort)(power - 1);
            // assuming n >= 2
            isDiscoveredPtr[(power - 1) / wordCount] = 1 << (((power - 1) % wordCount) * bitSize);
        }
        var readingQueue = queueOdd;
        var writingQueue = queueEven;
        var readingQueueCount = queueOddCount;
        var writingQueueIndex = queueEvenCount;
        nextDistance = 1;
        int readingQueueCountCached = 1;
        DeviceFunction.SyncThreads();
        // One outer iteration per BFS level; stop when a frontier is empty or
        // a singleton set was found.
        while (readingQueueCountCached > 0 && !shouldStop[0])
        {
            //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac);
            // Threads grab frontier entries dynamically via AtomicAdd.
            while (readingQueueIndex[0] < readingQueueCountCached)
            {
                var iter = DeviceFunction.AtomicAdd(readingQueueIndex, 1);
                if (iter >= readingQueueCountCached)
                {
                    break;
                }
                int consideringVertex = readingQueue[iter];
                // Apply both letters at once: each present state OR-merges its
                // packed A|B successor entry (absent states hit entry 0).
                vertexAfterTransition = 0;
                for (int i = 1; i <= n; i++, consideringVertex >>= 1)
                {
                    vertexAfterTransition |= gpuAB[i * (1 & consideringVertex)];
                }
                vertexAfterTransitionA = (ushort)(vertexAfterTransition >> n);
                vertexAfterTransitionB = (ushort)(vertexAfterTransition & maskN);
                var isDiscoveredOffset = (vertexAfterTransitionA % wordCount) * bitSize;
                if (0 == (isDiscoveredPtr[vertexAfterTransitionA / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                {
                    // should have used AtomicOr (which is not available in AleaGPU - very unfortunate)
                    var beforeAdded = DeviceFunction.AtomicAdd(
                        isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                        1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);
                    if (0 == beforeAdded)
                    {
                        // x & (x - 1) == 0 <=> singleton set: found the answer.
                        if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                        {
                            shortestSynchronizingWordLength[ac] = nextDistance;
                            isSynchronizing[ac] = true;
                            shouldStop[0] = true;
                            break;
                        }
                        writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)] = (ushort)vertexAfterTransitionA;
                    }
                    else
                    {
                        // Lost the discovery race: undo our increment.
                        DeviceFunction.AtomicSub(
                            isDiscoveredPtr.Ptr(vertexAfterTransitionA / wordCount),
                            1 << isDiscoveredOffset);
                    }
                }
                isDiscoveredOffset = (vertexAfterTransitionB % wordCount) * bitSize;
                if (0 == (isDiscoveredPtr[vertexAfterTransitionB / wordCount] & (twoToBitsize << isDiscoveredOffset)))
                {
                    var beforeAdded = DeviceFunction.AtomicAdd(
                        isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                        1 << isDiscoveredOffset) & (twoToBitsize << isDiscoveredOffset);
                    if (0 == beforeAdded)
                    {
                        if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                        {
                            shortestSynchronizingWordLength[ac] = nextDistance;
                            isSynchronizing[ac] = true;
                            shouldStop[0] = true;
                            break;
                        }
                        writingQueue[DeviceFunction.AtomicAdd(writingQueueIndex, 1)] = (ushort)vertexAfterTransitionB;
                    }
                    else
                    {
                        DeviceFunction.AtomicSub(
                            isDiscoveredPtr.Ptr(vertexAfterTransitionB / wordCount),
                            1 << isDiscoveredOffset);
                    }
                }
            }
            DeviceFunction.SyncThreads();
            // Swap the read/write queues for the next BFS level by parity.
            ++nextDistance;
            readingQueue = nextDistance % 2 == 0 ? queueEven : queueOdd;
            writingQueue = nextDistance % 2 != 0 ? queueEven : queueOdd;
            readingQueueCount = nextDistance % 2 == 0 ? queueEvenCount : queueOddCount;
            writingQueueIndex = nextDistance % 2 != 0 ? queueEvenCount : queueOddCount;
            readingQueueCountCached = nextDistance % 2 == 0 ? queueEvenCount[0] : queueOddCount[0];
            if (threadIdx.x == 0)
            {
                writingQueueIndex[0] = 0;
                readingQueueIndex[0] = 0;
            }
            DeviceFunction.SyncThreads();
        }
    }
}
// BFS over the power-set automaton using per-vertex active flags instead of
// queues: isActiveEven/isActiveOdd alternate as the read/write frontiers and
// per-thread [min, max) windows (tracked with AtomicMin/AtomicMax) narrow the
// range each thread must scan. When a singleton state set is reached, the
// distance is stored in shortestSynchronizingWordLength[ac] and
// isSynchronizing[ac] is set.
public static void Kernel(
    int[] precomputedStateTransitioningMatrixA,
    int[] precomputedStateTransitioningMatrixB,
    bool[] isSynchronizing,
    int[] shortestSynchronizingWordLength)
{
    var n = problemSize.Value;
    var arrayCount = precomputedStateTransitioningMatrixA.Length / n;
    var power = 1 << n;

    #region Pointer setup
    var byteOffset = 0;

    // Per-thread frontier windows for even/odd BFS levels.
    var minEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    byteOffset += blockDim.x * sizeof(int);

    var minOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    byteOffset += blockDim.x * sizeof(int);

    var maxEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    byteOffset += blockDim.x * sizeof(int);

    var maxOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <int>())
        .Ptr(byteOffset / sizeof(int));
    byteOffset += blockDim.x * sizeof(int);

    // Per-state one-hot transition masks for letters A and B.
    var gpuA = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
        .Ptr(byteOffset / sizeof(ushort))
        .Volatile();
    byteOffset += n * sizeof(ushort);

    var gpuB = DeviceFunction.AddressOfArray(__shared__.ExternArray <ushort>())
        .Ptr(byteOffset / sizeof(ushort))
        .Volatile();
    byteOffset += n * sizeof(ushort);

    // Alternating frontier flags plus the global visited set.
    var isActiveEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
        .Ptr(byteOffset / sizeof(bool));
    byteOffset += power * sizeof(bool);

    var isActiveOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
        .Ptr(byteOffset / sizeof(bool));
    byteOffset += power * sizeof(bool);

    var isDiscovered = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
        .Ptr(byteOffset / sizeof(bool));
    byteOffset += power * sizeof(bool);

    var addedAnythingOdd = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
        .Ptr(byteOffset / sizeof(bool))
        .Volatile();
    byteOffset += sizeof(bool);

    var addedAnythingEven = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
        .Ptr(byteOffset / sizeof(bool))
        .Volatile();
    byteOffset += sizeof(bool);

    var shouldStop = DeviceFunction.AddressOfArray(__shared__.ExternArray <bool>())
        .Ptr(byteOffset / sizeof(bool))
        .Volatile();
    byteOffset += sizeof(bool);
    #endregion

    ushort nextDistance;
    int vertexAfterTransitionA, vertexAfterTransitionB, index;
    // Distribute the automata evenly across blocks.
    var acPart = (arrayCount + gridDim.x - 1) / gridDim.x;
    var acBegin = blockIdx.x * acPart;
    var acEnd = acBegin + acPart;
    if (arrayCount < acEnd)
    {
        acEnd = arrayCount;
    }
    index = acBegin * n;
    // Number of vertices each thread owns in the contiguous partition.
    var threadWork = (power + blockDim.x - 1) / blockDim.x;
    DeviceFunction.SyncThreads();
    for (int ac = acBegin; ac < acEnd; ac++, index += n)
    {
        //Console.WriteLine("Begin {0}", threadIdx.x);
        // cleanup: level 1 reads the odd buffers and writes the even ones.
        var readingActive = isActiveOdd;
        var writingActive = isActiveEven;
        var readingAnythingAdded = addedAnythingOdd;
        var writingAnythingAdded = addedAnythingEven;
        var minRead = minOdd;
        var minWrite = minEven;
        var maxRead = maxOdd;
        var maxWrite = maxEven;
        // One designated thread (second warp's leader, or thread 0 for small
        // blocks) loads the transition masks.
        if (threadIdx.x == DeviceFunction.WarpSize
            || (threadIdx.x == 0 && blockDim.x <= DeviceFunction.WarpSize))
        {
            for (int i = 0; i < n; i++)
            {
                gpuA[i] = (ushort)(1 << precomputedStateTransitioningMatrixA[index + i]);
                gpuB[i] = (ushort)(1 << precomputedStateTransitioningMatrixB[index + i]);
            }
        }
        // Reset this thread's frontier windows to "empty".
        minRead[threadIdx.x] = int.MaxValue;
        maxRead[threadIdx.x] = 0;
        minWrite[threadIdx.x] = int.MaxValue;
        maxWrite[threadIdx.x] = 0;
        if (threadIdx.x == 0)
        {
            shouldStop[0] = false;
            readingAnythingAdded[0] = true;
        }
        else if (threadIdx.x == blockDim.x - 1)
        {
            // Seed the last thread's window with the start vertex power - 1.
            minRead[blockDim.x - 1] = (power - 1) % threadWork;
            maxRead[blockDim.x - 1] = power % threadWork;
        }
        nextDistance = 1;
        // Clear flags; only the full state set (power - 1) starts active.
        for (int consideringVertex = threadIdx.x, endingVertex = power, powerm1 = power - 1;
             consideringVertex < endingVertex;
             consideringVertex += blockDim.x)
        {
            isActiveEven[consideringVertex] = false;
            isActiveOdd[consideringVertex] = consideringVertex == powerm1;
            isDiscovered[consideringVertex] = consideringVertex == powerm1;
            //Console.WriteLine("cleaning {0}, {1} {2} {3}", consideringVertex,
            //    readingActive[consideringVertex], writingActive[consideringVertex], isDiscovered[consideringVertex]);
        }
        DeviceFunction.SyncThreads();
        // One outer iteration per BFS level; stop when no vertex was added or
        // a singleton set was found.
        while (readingAnythingAdded[0] && !shouldStop[0])
        {
            if (threadIdx.x == 0)
            {
                readingAnythingAdded[0] = false;
                writingAnythingAdded[0] = false;
                //Console.WriteLine("distance {0}", nextDistance);
            }
            // Each thread scans its contiguous slice, narrowed by its window.
            int myPart = (power + blockDim.x - 1) / blockDim.x;
            int beginningPointer = threadIdx.x * myPart;
            int endingPointer = beginningPointer + myPart;
            if (power < endingPointer)
            {
                endingPointer = power;
            }
            if (minRead[threadIdx.x] > beginningPointer)
            {
                beginningPointer = minRead[threadIdx.x];
            }
            //if (maxRead[threadIdx.x] < endingPointer)
            //    endingPointer = maxRead[threadIdx.x];
            minRead[threadIdx.x] = int.MaxValue;
            maxRead[threadIdx.x] = 0;
            minWrite[threadIdx.x] = int.MaxValue;
            maxWrite[threadIdx.x] = 0;
            DeviceFunction.SyncThreads();
            //Console.WriteLine("ac {3}, threadix {0}, begin {1}, end {2}", threadIdx.x, beginningPointer, endingPointer, ac);
            for (int consideringVertex = beginningPointer; consideringVertex < endingPointer; ++consideringVertex)
            {
                //Console.WriteLine("writeIsActive[{0}]=={1}", consideringVertex, writingActive[consideringVertex]);
                if (!readingActive[consideringVertex])
                {
                    //Console.WriteLine("Skipping {0}", consideringVertex);
                    continue;
                }
                else
                {
                    //Console.WriteLine("Considering {0} dist {1}", consideringVertex, nextDistance);
                    readingActive[consideringVertex] = false;
                }
                // Image of the state set under letters A and B.
                vertexAfterTransitionA = vertexAfterTransitionB = 0;
                for (int i = 0, mask = 1; i < n; i++, mask <<= 1)
                {
                    if (0 != (mask & consideringVertex))
                    {
                        vertexAfterTransitionA |= gpuA[i];
                        vertexAfterTransitionB |= gpuB[i];
                    }
                }
                if (!isDiscovered[vertexAfterTransitionA])
                {
                    isDiscovered[vertexAfterTransitionA] = true;
                    //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionA, consideringVertex);
                    // Grow the owning thread's window for the next level.
                    DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionA / threadWork), vertexAfterTransitionA % threadWork);
                    DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionA / threadWork), 1 + (vertexAfterTransitionA % threadWork));
                    // x & (x - 1) == 0 <=> singleton set: found the answer.
                    if (0 == (vertexAfterTransitionA & (vertexAfterTransitionA - 1)))
                    {
                        shortestSynchronizingWordLength[ac] = nextDistance;
                        isSynchronizing[ac] = true;
                        shouldStop[0] = true;
                        break;
                    }
                    writingActive[vertexAfterTransitionA] = true;
                    writingAnythingAdded[0] = true;
                }
                if (!isDiscovered[vertexAfterTransitionB])
                {
                    isDiscovered[vertexAfterTransitionB] = true;
                    //Console.WriteLine("Discovered {0} by {1}", vertexAfterTransitionB, consideringVertex);
                    DeviceFunction.AtomicMin(minWrite.Ptr(vertexAfterTransitionB / threadWork), vertexAfterTransitionB % threadWork);
                    DeviceFunction.AtomicMax(maxWrite.Ptr(vertexAfterTransitionB / threadWork), 1 + (vertexAfterTransitionB % threadWork));
                    if (0 == (vertexAfterTransitionB & (vertexAfterTransitionB - 1)))
                    {
                        shortestSynchronizingWordLength[ac] = nextDistance;
                        isSynchronizing[ac] = true;
                        shouldStop[0] = true;
                        break;
                    }
                    writingActive[vertexAfterTransitionB] = true;
                    writingAnythingAdded[0] = true;
                }
            }
            // Swap read/write roles for the next BFS level by distance parity.
            ++nextDistance;
            readingActive = nextDistance % 2 == 0 ? isActiveEven : isActiveOdd;
            writingActive = nextDistance % 2 != 0 ? isActiveEven : isActiveOdd;
            readingAnythingAdded = nextDistance % 2 == 0 ? addedAnythingEven : addedAnythingOdd;
            writingAnythingAdded = nextDistance % 2 != 0 ? addedAnythingEven : addedAnythingOdd;
            minRead = nextDistance % 2 == 0 ? minEven : minOdd;
            minWrite = nextDistance % 2 != 0 ? minEven : minOdd;
            maxRead = nextDistance % 2 == 0 ? maxEven : maxOdd;
            maxWrite = nextDistance % 2 != 0 ? maxEven : maxOdd;
            DeviceFunction.SyncThreads();
        }
    }
}