Example #1
    /// <summary>
    /// Runs one forward pass on a generated sample when Return is pressed and logs the result.
    /// </summary>
    private void Update()
    {
        if (Input.GetKeyDown(KeyCode.Return))
        {
            float[][] sample = GenerateSample();
            float[]   result = nn.FeedForward(sample[0]);

            // Print result
            Debug.Log(
                $"Input: {System.Math.Round(sample[0][0], 1)} {System.Math.Round(sample[0][1], 2)}, " +
                $"Output: {System.Math.Round(result[0], 2)}, " +
                $"Expected Output: {System.Math.Round(sample[1][0], 2)}");
        }
    }
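
Example #1 assumes a GenerateSample() helper that returns a jagged array: element 0 holds the network inputs and element 1 holds the expected output(s). What those values actually are depends on the task being learned; the sketch below, which pairs two random inputs with their sum, is only an illustration of the expected shape, not the actual generator.

    /// <summary>
    /// Hypothetical sample generator: returns { inputs, expected outputs }.
    /// The real training task is defined elsewhere; summing two random values
    /// is just a placeholder that matches the shape used in Update().
    /// </summary>
    private float[][] GenerateSample()
    {
        float a = Random.Range(0f, 0.5f);
        float b = Random.Range(0f, 0.5f);

        return new float[][]
        {
            new float[] { a, b },   // sample[0]: network inputs
            new float[] { a + b }   // sample[1]: expected output
        };
    }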
Example #2
        public void Learn(bool terminalState)
        {
            if (!replayMemory.CanGetSample)
            {
                return;
            }

            // State, Action, Reward, New State, Done
            List<Tuple<float[], int, float, float[], bool>> sample = replayMemory.GetSample();

            List<float[]> trainingInput  = new List<float[]>();
            List<float[]> trainingOutput = new List<float[]>();

            List<float[]> currentQsList = new List<float[]>();
            List<float[]> futureQsList  = new List<float[]>();

            for (int i = 0; i < sample.Count; i++)
            {
                // Split sample
                var     currentSample = sample[i];
                float[] state         = currentSample.Item1;
                int     action        = currentSample.Item2;
                float   reward        = currentSample.Item3;
                float[] newState      = currentSample.Item4;
                bool    done          = currentSample.Item5;

                // Predict Q values for the current state with the policy network
                currentQsList.Add(policyNet.FeedForward(state));
                // Predict Q values for the next state with the target network
                futureQsList.Add(targetNet.FeedForward(newState));

                // If this is not a terminal state, compute the new Q value from the future Q values
                // (the usual Q-learning target: reward + discounted max future Q); in a terminal
                // state there is no future, so the new Q value is just the reward.
                float newQ;
                if (!done)
                {
                    float maxFutureQ = GetBestAction(futureQsList.Last());
                    newQ = reward + DiscountFactor * maxFutureQ;
                }
                else
                {
                    newQ = reward;
                }

                // Overwrite the Q value of the action that was actually taken with the new target
                float[] currentQs = currentQsList[i];
                currentQs[action] = newQ;

                trainingInput.Add(state);
                trainingOutput.Add(currentQs);
            }

            policyNetTrainer.TrainBatchesOnce(trainingInput.ToArray(), trainingOutput.ToArray());

            if (terminalState)
            {
                targetUpateCounter++;
            }

            if (targetUpateCounter >= updateTargetNetEvery)
            {
                UpdateTargetNet();
                targetUpateCounter = 0;
            }
        }
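
Learn() also relies on UpdateTargetNet(), which is not shown above. In DQN this is a hard update that copies the policy network's weights into the target network once every updateTargetNetEvery terminal states. The sketch below assumes a hypothetical CopyWeightsFrom method on the network class; the real implementation may instead clone the policy network or copy its weight arrays layer by layer.

        /// <summary>
        /// Hard update of the target network: copy the policy network's weights.
        /// CopyWeightsFrom is a hypothetical helper used here for illustration only.
        /// </summary>
        private void UpdateTargetNet()
        {
            targetNet.CopyWeightsFrom(policyNet);
        }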