/// <summary>
/// Returns the maximum Q-value obtainable from the given state (cell).
/// Final states have no outgoing moves, so their estimate is always 0.
/// </summary>
/// <param name="nextCell">Index of the cell whose best Q-value is requested.</param>
/// <returns>The Q-value of the best movement from <paramref name="nextCell"/>, or 0 for a final state.</returns>
private double CalculateEstimate(int nextCell)
{
    // A terminal cell contributes no future reward.
    if (LabryntRules.IsFinalState(nextCell))
    {
        return 0;
    }

    Movement bestMove = GetBestMovement(nextCell);
    return qMat.GetQValue(nextCell, bestMove);
}
/// <summary>
/// Unity per-frame callback. Accumulates elapsed time and, once at least one
/// step interval (the inverse of the speed selector) has passed, runs the
/// corresponding number of Q-learning steps: epsilon-greedy action choice,
/// reward observation, Q-value update, and episode/epsilon bookkeeping when a
/// final state is reached.
/// </summary>
void Update()
{
    if (paused)
    {
        return;
    }

    timer += Time.deltaTime;
    float dt = speedSelector.GetInverse();
    if (timer <= dt)
    {
        return;
    }

    // Calculate how many algorithm steps are due according to the speed selector.
    int loops = (int)(timer / dt);

    // BUG FIX: consume the time for EVERY step taken, not just one. The
    // original subtracted a single dt after running 'loops' steps, so the
    // leftover time was re-counted on the next frame and extra steps ran.
    timer -= loops * dt;

    while (loops > 0)
    {
        if (restart)
        {
            InitialiseAlgorithm();
            restart = false;
            newEpisode = true;
        }
        if (newEpisode)
        {
            SetValuesForNewEpisode();
            newEpisode = false;
        }

        // Choose action (a) based on the epsilon-greedy policy (p).
        Movement currentMove;
        if (ShouldExplore())
        {
            currentMove = GetRandomMovement(currentCell);
        }
        else
        {
            currentMove = GetBestMovement(currentCell);
        }

        // Observe reward (r).
        int reward = LabryntRules.GetReward(currentCell, currentMove);
        double oldQValue = qMat.GetQValue(currentCell, currentMove);

        // Estimate the best Q-value obtainable from the next state.
        int nextCell = LabryntRules.GetLandingCell(currentCell, currentMove);
        double estimation = CalculateEstimate(nextCell);

        // Recalculate the Q-value for the current cell based on the estimation.
        double updatedQValue = CalculateNewQValue(oldQValue, alpha, gamma, reward, estimation);
        qMat.SetQValue(currentCell, currentMove, updatedQValue);

        cumulativeReward += reward;
        currentCell = nextCell;

        // Update the display with the new player position.
        UpdatePlayerPositionInDisplay();

        // If the player landed in a final state cell, decay epsilon and start a new episode.
        if (LabryntRules.IsFinalState(currentCell))
        {
            // Shift from exploration to exploitation gradually between episodes.
            // BUG FIX: 'else' prevents both decays from applying in the same
            // step when epsilon crosses the 0.3 threshold (the original pair
            // of independent 'if's could double-decay), and it also handles
            // epsilon == 0.3 exactly, which previously decayed never.
            if (epsilon > 0.3)
            {
                epsilon *= epsilonDecay1;
            }
            else
            {
                epsilon *= epsilonDecay2;
            }
            newEpisode = true;
        }

        loops--;
    }
}