/// <summary>
 /// Reads the two 64-bit PRNG states, the test mode flag and the iteration count from the given
 /// <see cref="SimpleMemory"/> and stores them in <c>Prng1</c>, <c>Prng2</c>, <c>TestMode</c> and
 /// <c>NumberOfIterations</c>.
 /// </summary>
 /// <param name="memory">
 /// The memory to read from. Four consecutive 32-bit cells starting at <c>MemIndexRandomStates</c> hold the
 /// PRNG states, the cell at <c>MemIndexStepMode</c> holds the test mode flag in its lowest bit, and the cell
 /// at <c>MemIndexNumberOfIterations</c> holds the iteration count.
 /// </param>
 public void InitializeParametersFromMemory(SimpleMemory memory)
 {
     // Each PRNG state is assembled from two consecutive 32-bit cells; the first cell read becomes the
     // upper 32 bits and the second one the lower 32 bits.
     ulong firstStateHigh = (ulong)memory.ReadUInt32(MemIndexRandomStates);
     uint  firstStateLow  = memory.ReadUInt32(MemIndexRandomStates + 1);
     Prng1 = new PrngMWC64X((firstStateHigh << 32) | firstStateLow);

     ulong secondStateHigh = (ulong)memory.ReadUInt32(MemIndexRandomStates + 2);
     uint  secondStateLow  = memory.ReadUInt32(MemIndexRandomStates + 3);
     Prng2 = new PrngMWC64X((secondStateHigh << 32) | secondStateLow);

     // Only the lowest bit of the step mode cell is significant.
     uint stepModeCell = memory.ReadUInt32(MemIndexStepMode);
     TestMode = (stepModeCell & 1) == 1;

     NumberOfIterations = memory.ReadUInt32(MemIndexNumberOfIterations);
 }
        /// <summary>
        /// Runs the requested number of iterations on the grid stored in <paramref name="memory"/>. The grid is
        /// processed in LocalGridSize x LocalGridSize windows, ParallelTasks windows at a time: each window is
        /// copied into a task-local buffer, a task performs random "poke" attempts on it, and the result is
        /// written back into <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">
        /// Memory holding the iteration count (at KpzKernelsParallelizedInterface.MemIndexNumberOfIterations), the
        /// PRNG seeds (consecutive 32-bit cells starting at MemIndexRandomSeed) and the grid cells (starting at
        /// MemIndexGrid, one 32-bit cell per grid cell with dx in bit 0 and dy in bit 1). The grid cells are
        /// overwritten in place.
        /// </param>
        public virtual void ScheduleIterations(SimpleMemory memory)
        {
            int       numberOfIterations    = memory.ReadInt32(KpzKernelsParallelizedInterface.MemIndexNumberOfIterations);
            // Number of local-grid-sized windows needed to tile the whole grid once.
            const int TasksPerIteration     = (GridSize * GridSize) / (LocalGridSize * LocalGridSize);
            // Number of batches (of ParallelTasks windows each) needed to cover all windows once.
            const int SchedulesPerIteration = TasksPerIteration / ParallelTasks;
            // Total number of full passes over the grid (see the explanation above the main loop).
            int       iterationGroupSize    = numberOfIterations * ReschedulesPerTaskIteration;
            // Number of poke attempts a single task performs on its window in one pass.
            const int PokesInsideTask       = LocalGridSize * LocalGridSize / ReschedulesPerTaskIteration;
            // Number of windows along one side of the grid.
            const int LocalGridPartitions   = GridSize / LocalGridSize;
            // Note: TotalNumberOfTasks = TasksPerIteration * NumberOfIterations ==
            //   ((GridSize * GridSize) / (LocalGridSize * LocalGridSize)) * NumberOfIterations
            // Running offset (in 32-bit cells) of the next unread seed word, relative to MemIndexRandomSeed.
            int  parallelTaskRandomIndex = 0;
            uint randomSeedTemp;
            var  prng0 = new PrngMWC64X();

            var taskLocals = new KpzKernelsTaskState[ParallelTasks];

            // Set up the per-task state: two local bit planes (dx and dy) and two PRNGs. Each PRNG consumes two
            // consecutive seed cells: the first one read is the lower 32 bits of the state, the second one the
            // upper 32 bits.
            for (int TaskLocalsIndex = 0; TaskLocalsIndex < ParallelTasks; TaskLocalsIndex++)
            {
                taskLocals[TaskLocalsIndex] = new KpzKernelsTaskState
                {
                    bramDx = new bool[LocalGridSize * LocalGridSize],
                    bramDy = new bool[LocalGridSize * LocalGridSize],
                    prng1  = new PrngMWC64X
                    {
                        state = memory.ReadUInt32(MemIndexRandomSeed + parallelTaskRandomIndex++)
                    }
                };
                randomSeedTemp = memory.ReadUInt32(MemIndexRandomSeed + parallelTaskRandomIndex++);
                taskLocals[TaskLocalsIndex].prng1.state |= ((ulong)randomSeedTemp) << 32;

                taskLocals[TaskLocalsIndex].prng2 = new PrngMWC64X
                {
                    state = memory.ReadUInt32(MemIndexRandomSeed + parallelTaskRandomIndex++)
                };
                randomSeedTemp = memory.ReadUInt32(MemIndexRandomSeed + parallelTaskRandomIndex++);
                taskLocals[TaskLocalsIndex].prng2.state |= ((ulong)randomSeedTemp) << 32;
            }

            // What is iterationGroupIndex good for?
            // IterationPerTask needs to be between 0.5 and 1 based on the e-mail of Mate.
            // If we want 10 iterations, and starting a full series of tasks makes half iteration on the full table,
            // then we need to start it 20 times (thus IterationGroupSize will be 20).

            // prng0 is seeded from the two seed cells that follow the per-task seeds (lower half first). It is
            // only used to pick the random window offset of each pass.
            prng0.state    = memory.ReadUInt32(MemIndexRandomSeed + parallelTaskRandomIndex++);
            randomSeedTemp = memory.ReadUInt32(MemIndexRandomSeed + parallelTaskRandomIndex++);
            prng0.state   |= ((ulong)randomSeedTemp) << 32;

            for (int iterationGroupIndex = 0; iterationGroupIndex < iterationGroupSize; iterationGroupIndex++)
            {
                // One random number per pass supplies both offsets: X from the lowest bits, Y from bit 16 upwards.
                uint randomValue0 = prng0.NextUInt32();
                // This assumes that LocalGridSize is 2^N:
                int randomXOffset = (int)((LocalGridSize - 1) & randomValue0);
                int randomYOffset = (int)((LocalGridSize - 1) & (randomValue0 >> 16));
                for (int scheduleIndex = 0; scheduleIndex < SchedulesPerIteration; scheduleIndex++)
                {
                    var tasks = new Task <KpzKernelsTaskState> [ParallelTasks];
                    for (int parallelTaskIndex = 0; parallelTaskIndex < ParallelTasks; parallelTaskIndex++)
                    {
                        // Decide the X and Y starting coordinates based on ScheduleIndex and ParallelTaskIndex
                        // (and the random added value)
                        int localGridIndex = parallelTaskIndex + scheduleIndex * ParallelTasks;
                        // The X and Y index of this window (partition) among the windows tiling the big grid:
                        int partitionX = localGridIndex % LocalGridPartitions;
                        int partitionY = localGridIndex / LocalGridPartitions;
                        // The X and Y coordinate of the window's first cell within the big table (grid), shifted
                        // by the random offset of this pass:
                        int baseX = partitionX * LocalGridSize + randomXOffset;
                        int baseY = partitionY * LocalGridSize + randomYOffset;

                        // Copy to local memory
                        for (int copyDstX = 0; copyDstX < LocalGridSize; copyDstX++)
                        {
                            for (int CopyDstY = 0; CopyDstY < LocalGridSize; CopyDstY++)
                            {
                                // Prevent going out of grid memory area (e.g. reading into random seed) by
                                // wrapping the coordinates around at the grid edges:
                                int  copySrcX = (baseX + copyDstX) % GridSize;
                                int  copySrcY = (baseY + CopyDstY) % GridSize;
                                uint value    = memory.ReadUInt32(MemIndexGrid + copySrcX + copySrcY * GridSize);
                                // Bit 0 of the cell is dx, bit 1 is dy.
                                taskLocals[parallelTaskIndex].bramDx[copyDstX + CopyDstY * LocalGridSize] =
                                    (value & 1) == 1;
                                taskLocals[parallelTaskIndex].bramDy[copyDstX + CopyDstY * LocalGridSize] =
                                    (value & 2) == 2;
                            }
                        }

                        // Start the task working on this window; its state object is handed over as the task
                        // state argument and returned as the task's result.
                        tasks[parallelTaskIndex] = Task.Factory.StartNew(
                            rawTaskState =>
                        {
                            // Then perform PokesInsideTask poke attempts on the local window
                            var taskLocal = (KpzKernelsTaskState)rawTaskState;
                            for (int pokeIndex = 0; pokeIndex < PokesInsideTask; pokeIndex++)
                            {
                                // ==== <Now randomly switch four cells> ====

                                // Generating two random numbers:
                                uint taskRandomNumber1 = taskLocal.prng1.NextUInt32();
                                uint taskRandomNumber2 = taskLocal.prng2.NextUInt32();

                                // The existence of var-1 in code is a good indicator of that it is assumed to be 2^N:
                                int pokeCenterX      = (int)(taskRandomNumber1 & (LocalGridSize - 1));
                                int pokeCenterY      = (int)((taskRandomNumber1 >> 16) & (LocalGridSize - 1));
                                int pokeCenterIndex  = pokeCenterX + pokeCenterY * LocalGridSize;
                                // The lower and upper 16 bits of the second random number; they are compared
                                // against IntegerProbabilityP and IntegerProbabilityQ below.
                                uint randomVariable1 = taskRandomNumber2 & ((1 << 16) - 1);
                                uint randomVariable2 = (taskRandomNumber2 >> 16) & ((1 << 16) - 1);

                                // get neighbour indexes:
                                int rightNeighbourIndex;
                                int bottomNeighbourIndex;
                                // We skip if neighbours would fall out of the local grid (the skipped attempt
                                // still counts as one of the PokesInsideTask pokes and has consumed random numbers):
                                if (pokeCenterX >= LocalGridSize - 1 || pokeCenterY >= LocalGridSize - 1)
                                {
                                    continue;
                                }
                                int rightNeighbourX  = pokeCenterX + 1;
                                int rightNeighbourY  = pokeCenterY;
                                int bottomNeighbourX = pokeCenterX;
                                int bottomNeighbourY = pokeCenterY + 1;
                                rightNeighbourIndex  = rightNeighbourY * LocalGridSize + rightNeighbourX;
                                bottomNeighbourIndex = bottomNeighbourY * LocalGridSize + bottomNeighbourX;

                                // We check our own {dx,dy} values, and the right neighbour's dx, and bottom neighbour's dy.

                                // NOTE(review): the "false ||" terms below have no effect on the result; presumably
                                // placeholders for a flag that forces the switch - confirm before removing.
                                if (
                                    // If we get the pattern {01, 01} we have a pyramid:
                                    ((taskLocal.bramDx[pokeCenterIndex] && !taskLocal.bramDx[rightNeighbourIndex]) &&
                                     (taskLocal.bramDy[pokeCenterIndex] && !taskLocal.bramDy[bottomNeighbourIndex]) &&
                                     (false || randomVariable1 < IntegerProbabilityP)) ||
                                    // If we get the pattern {10, 10} we have a hole:
                                    ((!taskLocal.bramDx[pokeCenterIndex] && taskLocal.bramDx[rightNeighbourIndex]) &&
                                     (!taskLocal.bramDy[pokeCenterIndex] && taskLocal.bramDy[bottomNeighbourIndex]) &&
                                     (false || randomVariable2 < IntegerProbabilityQ))
                                    )
                                {
                                    // We make a hole into a pyramid, and a pyramid into a hole.
                                    taskLocal.bramDx[pokeCenterIndex]      = !taskLocal.bramDx[pokeCenterIndex];
                                    taskLocal.bramDy[pokeCenterIndex]      = !taskLocal.bramDy[pokeCenterIndex];
                                    taskLocal.bramDx[rightNeighbourIndex]  = !taskLocal.bramDx[rightNeighbourIndex];
                                    taskLocal.bramDy[bottomNeighbourIndex] = !taskLocal.bramDy[bottomNeighbourIndex];
                                }

                                // ==== </Now randomly switch four cells> ====
                            }
                            return(taskLocal);
                        }, taskLocals[parallelTaskIndex]);
                    }

                    // Block until every task of this batch has finished before touching their results.
                    Task.WhenAll(tasks).Wait();

                    // Copy back to SimpleMemory
                    for (int parallelTaskIndex = 0; parallelTaskIndex < ParallelTasks; parallelTaskIndex++)
                    {
                        // Calculate these things again (same window position as when copying to local memory)
                        int localGridIndex = parallelTaskIndex + scheduleIndex * ParallelTasks;
                        // The X and Y index of this window (partition) among the windows tiling the big grid:
                        int partitionX = localGridIndex % LocalGridPartitions;
                        int partitionY = localGridIndex / LocalGridPartitions;
                        // The X and Y coordinate of the window's first cell within the big table (grid):
                        int baseX = partitionX * LocalGridSize + randomXOffset;
                        int baseY = partitionY * LocalGridSize + randomYOffset;
                        //Console.WriteLine("CopyBack | Task={0}, To: {1},{2}", ParallelTaskIndex, BaseX, BaseY);

                        for (int copySrcX = 0; copySrcX < LocalGridSize; copySrcX++)
                        {
                            for (int copySrcY = 0; copySrcY < LocalGridSize; copySrcY++)
                            {
                                // Same wrap-around as when reading, so each cell goes back where it came from.
                                int  copyDstX = (baseX + copySrcX) % GridSize;
                                int  copyDstY = (baseY + copySrcY) % GridSize;
                                // Pack dx into bit 0 and dy into bit 1; all other bits of the cell are written as 0.
                                uint value    =
                                    (tasks[parallelTaskIndex].Result.bramDx[copySrcX + copySrcY * LocalGridSize] ? 1U : 0U) |
                                    (tasks[parallelTaskIndex].Result.bramDy[copySrcX + copySrcY * LocalGridSize] ? 2U : 0U);
                                //Note: use (tasks[parallelTaskIndex].Result), because
                                //    (TaskLocals[ParallelTaskIndex]) won't work.
                                memory.WriteUInt32(MemIndexGrid + copyDstX + copyDstY * GridSize, value);
                            }
                        }

                        // Take PRNG current state from Result to feed it to input next time
                        taskLocals[parallelTaskIndex].prng1.state = tasks[parallelTaskIndex].Result.prng1.state;
                        taskLocals[parallelTaskIndex].prng2.state = tasks[parallelTaskIndex].Result.prng2.state;
                    }
                }
            }
        }