/// <summary>
/// Manual test: on Return/Enter, run one generated sample through the network
/// and log the input, the prediction, and the expected output.
/// </summary>
private void Update()
{
    if (Input.GetKeyDown(KeyCode.Return))
    {
        float[][] sample = GenerateSample();
        float[] result = nn.FeedForward(sample[0]);

        // Print result, rounded to two decimals throughout
        Debug.Log(
            $"Input: {System.Math.Round(sample[0][0], 2)} {System.Math.Round(sample[0][1], 2)}, " +
            $"Output: {System.Math.Round(result[0], 2)}, " +
            $"Expected Output: {System.Math.Round(sample[1][0], 2)}");
    }
}
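The test above assumes a GenerateSample helper that returns a jagged array where sample[0] is the input vector and sample[1] is the expected output. Such a helper is not shown in this section; here is a minimal hypothetical sketch, assuming a toy task of summing two random inputs purely for illustration (the project's actual task and sample shape may differ):

// Hypothetical sketch: the real GenerateSample lives elsewhere in the project.
// A toy regression task (summing two random inputs) is assumed for illustration.
private float[][] GenerateSample()
{
    float a = UnityEngine.Random.Range(0f, 0.5f);
    float b = UnityEngine.Random.Range(0f, 0.5f);

    // sample[0] = network input, sample[1] = expected output
    return new float[][]
    {
        new float[] { a, b },
        new float[] { a + b }
    };
}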
public void Learn(bool terminalState)
{
    if (!replayMemory.CanGetSample)
    {
        return;
    }

    // Each transition in the minibatch: (State, Action, Reward, New State, Done)
    List<Tuple<float[], int, float, float[], bool>> sample = replayMemory.GetSample();

    List<float[]> trainingInput = new List<float[]>();
    List<float[]> trainingOutput = new List<float[]>();
    List<float[]> currentQsList = new List<float[]>();
    List<float[]> futureQsList = new List<float[]>();

    for (int i = 0; i < sample.Count; i++)
    {
        // Unpack the transition
        var currentSample = sample[i];
        float[] state = currentSample.Item1;
        int action = currentSample.Item2;
        float reward = currentSample.Item3;
        float[] newState = currentSample.Item4;
        bool done = currentSample.Item5;

        // Predict Q values for the current state with the policy network
        currentQsList.Add(policyNet.FeedForward(state));

        // Predict Q values for the next state with the target network
        futureQsList.Add(targetNet.FeedForward(newState));

        // If this is not a terminal state, compute the Bellman target
        // newQ = reward + gamma * max Q_target(newState); this is the same
        // update rule as tabular Q-learning, except the network replaces the table.
        // For a terminal state there is no future, so the target is just the reward.
        float newQ;
        if (!done)
        {
            // GetBestAction is expected to return the highest predicted Q value
            // (the max over actions), not the action index
            float maxFutureQ = GetBestAction(futureQsList.Last());
            newQ = reward + DiscountFactor * maxFutureQ;
        }
        else
        {
            newQ = reward;
        }

        // Overwrite only the taken action's Q value with the computed target,
        // so the other outputs keep their current predictions
        float[] currentQs = currentQsList[i];
        currentQs[action] = newQ;

        trainingInput.Add(state);
        trainingOutput.Add(currentQs);
    }

    // Fit the policy network on the whole minibatch
    policyNetTrainer.TrainBatchesOnce(trainingInput.ToArray(), trainingOutput.ToArray());

    // Count finished episodes and periodically sync the target network
    if (terminalState)
    {
        targetUpateCounter++;
    }

    if (targetUpateCounter >= updateTargetNetEvery)
    {
        UpdateTargetNet();
        targetUpateCounter = 0;
    }
}
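Learn depends on two helpers not shown in this section: GetBestAction and UpdateTargetNet. The following is a hedged sketch of what they might look like, assuming GetBestAction returns the maximum predicted Q value and that the network class offers some way to clone weights; CopyWeightsFrom is a hypothetical method name, so substitute whatever cloning mechanism your NeuralNetwork class actually provides:

// Sketch only: the real implementations depend on the project's NeuralNetwork API.

// Assumed to return the highest predicted Q value across all actions,
// which is what the Bellman target in Learn() needs.
private float GetBestAction(float[] qValues)
{
    float best = qValues[0];
    for (int i = 1; i < qValues.Length; i++)
    {
        if (qValues[i] > best)
        {
            best = qValues[i];
        }
    }
    return best;
}

// Copy the policy network's weights into the target network so the bootstrap
// targets stay stable between syncs. CopyWeightsFrom is hypothetical.
private void UpdateTargetNet()
{
    targetNet.CopyWeightsFrom(policyNet);
}

Keeping a separate, infrequently updated target network is what prevents the moving-target feedback loop that destabilizes training when the same network both produces and chases its own targets.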