/// <summary>
/// Device kernel: for every transition-table pair (A, B) assigned to this block, each
/// thread applies a 9-step sequence of A/B transitions to the full vertex set and the
/// result is flagged when some thread ends on a set with at most one bit left.
/// </summary>
/// <param name="precomputedStateTransitioningMatrixA">
/// Flattened tables, <c>n</c> entries per automaton; entry <c>i</c> is the target vertex
/// index of vertex <c>i</c> under the first letter.
/// </param>
/// <param name="precomputedStateTransitioningMatrixB">
/// Same layout as <paramref name="precomputedStateTransitioningMatrixA"/>, for the second letter.
/// </param>
/// <param name="statusOfSynchronization">
/// One flag per automaton. This kernel only ever writes <c>true</c>; entries are never
/// reset here, so the caller presumably clears the array beforehand (confirm at call site).
/// </param>
/// <remarks>
/// <c>n</c> is read from <c>problemSize.Value</c>. The extern shared array is indexed at
/// elements <c>[0, 2n)</c>, so the launch must provide at least <c>2 * n</c> ushorts of
/// dynamic shared memory.
/// </remarks>
public static void Kernel(
    int[] precomputedStateTransitioningMatrixA,
    int[] precomputedStateTransitioningMatrixB,
    bool[] statusOfSynchronization)
{
    // the status might be YES, NO and DUNNO (aleaGPU enum???)
    // TODO: change this Kernel and computation!

    // Number of A/B steps each thread applies; bit k of the thread index picks the letter for step k.
    const int stepsPerPath = 9;

    var n = problemSize.Value;
    var automataCount = precomputedStateTransitioningMatrixA.Length / n;
    var allVertices = (1 << n) - 1;

    // Shared-memory layout (in ushort elements): [0, n) holds table A, [n, 2n) holds table B.
    var sharedA = DeviceFunction.AddressOfArray(__shared__.ExternArray<ushort>())
        .Ptr(0)
        .Volatile();
    var sharedB = DeviceFunction.AddressOfArray(__shared__.ExternArray<ushort>())
        .Ptr(n)
        .Volatile();

    // Split the automata into contiguous chunks, one chunk per block (ceiling division).
    var perBlock = (automataCount + gridDim.x - 1) / gridDim.x;
    var first = blockIdx.x * perBlock;
    var last = first + perBlock;
    if (last > automataCount)
    {
        last = automataCount;
    }

    for (int ac = first; ac < last; ac++)
    {
        var rowBase = ac * n;

        // Make sure nobody is still reading the previous automaton's tables.
        DeviceFunction.SyncThreads();

        if (threadIdx.x == 0)
        {
            // Store each transition target as a one-hot bit so images can be OR-ed together.
            for (int v = 0; v < n; v++)
            {
                sharedA[v] = (ushort)(1 << precomputedStateTransitioningMatrixA[rowBase + v]);
                sharedB[v] = (ushort)(1 << precomputedStateTransitioningMatrixB[rowBase + v]);
            }
        }

        var letters = threadIdx.x;
        var current = allVertices;

        // Tables must be fully written before any thread starts reading them.
        DeviceFunction.SyncThreads();

        for (int step = 0; step < stepsPerPath; step++)
        {
            var useB = (letters & 1) != 0;
            var image = 0;

            for (int v = 0, bit = 1; v < n; v++, bit <<= 1)
            {
                if ((current & bit) != 0)
                {
                    image |= useB ? sharedB[v] : sharedA[v];
                }
            }

            current = image;
            letters >>= 1;
        }

        // x & (x - 1) clears the lowest set bit, so zero means at most one vertex remains.
        var collapsed = (current & (current - 1)) == 0;

        // Every thread must reach this call. NOTE(review): presumably a warp-wide vote,
        // given the one-writer-per-WarpSize guard below; confirm against the Alea GPU docs.
        var anyCollapsed = DeviceFunction.Any(collapsed);

        if (anyCollapsed && threadIdx.x % DeviceFunction.WarpSize == 0)
        {
            statusOfSynchronization[ac] = true;
        }
    }
}