/// <summary>
/// The Run method provides the main 'actor' loop that performs the following steps:
/// 1.) get state
/// 2.) build experience
/// 3.) create policy gradients
/// 4.) train on experiences
/// </summary>
/// <param name="phase">Specifies the phase.</param>
/// <param name="nIterations">Specifies the number of iterations to run.</param>
public void Run(Phase phase, int nIterations)
{
    MemoryCollection m_rgMemory = new MemoryCollection();
    double? dfRunningReward = null;
    double dfRewardSum = 0;
    int nEpisodeNumber = 0;
    int nIteration = 0;

    StateBase s = getData(-1);

    while (!m_brain.Cancel.WaitOne(0) && (nIterations == -1 || nIteration < nIterations))
    {
        // Preprocess the observation.
        SimpleDatum x = m_brain.Preprocess(s, m_bUseRawInput);

        // Forward the policy network and sample an action.
        float fAprob;
        int action = m_brain.act(x, out fAprob);

        // Take the next step using the action.
        StateBase s_ = getData(action);
        dfRewardSum += s_.Reward;

        if (phase == Phase.TRAIN)
        {
            // Build up the episode memory, using the reward received for taking the action.
            m_rgMemory.Add(new MemoryItem(s, x, action, fAprob, (float)s_.Reward));

            // An episode has finished.
            if (s_.Done)
            {
                nEpisodeNumber++;
                nIteration++;

                m_brain.Reshape(m_rgMemory);

                // Compute the discounted reward (backwards through time).
                float[] rgDiscountedR = m_rgMemory.GetDiscountedRewards(m_fGamma, m_bAllowDiscountReset);
                // Rewards are standardized when set to be unit normal (helps control the gradient estimator variance).
                m_brain.SetDiscountedR(rgDiscountedR);

                // Modulate the gradient with the advantage (PG magic happens right here.)
                float[] rgDlogp = m_rgMemory.GetPolicyGradients();
                // The discounted R is applied to the policy gradient within the loss function, just before the backward pass.
                m_brain.SetPolicyGradients(rgDlogp);

                // Train for one iteration, which triggers the loss function.
                List<Datum> rgData = m_rgMemory.GetData();
                m_brain.SetData(rgData);
                m_brain.Train(nEpisodeNumber);

                // Update the running reward.
                if (!dfRunningReward.HasValue)
                {
                    dfRunningReward = dfRewardSum;
                }
                else
                {
                    dfRunningReward = dfRunningReward.Value * 0.99 + dfRewardSum * 0.01;
                }

                updateStatus(nEpisodeNumber, dfRewardSum, dfRunningReward.Value);
                dfRewardSum = 0;

                s = getData(-1);
                m_rgMemory.Clear();
            }
            else
            {
                s = s_;
            }
        }
        else
        {
            if (s_.Done)
            {
                nEpisodeNumber++;

                // Update the running reward.
                if (!dfRunningReward.HasValue)
                {
                    dfRunningReward = dfRewardSum;
                }
                else
                {
                    dfRunningReward = dfRunningReward.Value * 0.99 + dfRewardSum * 0.01;
                }

                updateStatus(nEpisodeNumber, dfRewardSum, dfRunningReward.Value);
                dfRewardSum = 0;

                s = getData(-1);
            }
            else
            {
                s = s_;
            }

            nIteration++;
        }
    }
}
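The discounted rewards returned by GetDiscountedRewards and handed to SetDiscountedR above follow the standard "discount backwards through time, then standardize" recipe. The code below is a minimal, self-contained sketch of that recipe, not the MyCaffe implementation; the class DiscountedRewardSketch and its Compute method are hypothetical names, and the reset-on-non-zero-reward rule is an assumption about what m_bAllowDiscountReset enables.

using System;
using System.Linq;

public static class DiscountedRewardSketch
{
    // Assumption: rgReward holds one raw reward per time step of a single episode.
    public static float[] Compute(float[] rgReward, float fGamma, bool bAllowReset)
    {
        float[] rgDiscounted = new float[rgReward.Length];
        float fRunningAdd = 0;

        // Accumulate the discounted return backwards through time.
        for (int t = rgReward.Length - 1; t >= 0; t--)
        {
            // Assumed meaning of the reset flag: a non-zero reward marks a
            // sub-episode boundary (e.g. a point scored), so the sum restarts.
            if (bAllowReset && rgReward[t] != 0)
                fRunningAdd = 0;

            fRunningAdd = fRunningAdd * fGamma + rgReward[t];
            rgDiscounted[t] = fRunningAdd;
        }

        // Standardize to zero mean, unit variance to help control the
        // variance of the gradient estimator.
        float fMean = rgDiscounted.Average();
        float fStd = (float)Math.Sqrt(rgDiscounted.Select(r => (r - fMean) * (r - fMean)).Average());

        for (int t = 0; t < rgDiscounted.Length; t++)
        {
            rgDiscounted[t] = (rgDiscounted[t] - fMean) / (fStd + 1e-8f);
        }

        return rgDiscounted;
    }
}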
/// <summary>
/// The Run method provides the main 'actor' loop that performs the following steps:
/// 1.) get state
/// 2.) build experience
/// 3.) create policy gradients
/// 4.) train on experiences
/// </summary>
/// <param name="phase">Specifies the phase.</param>
/// <param name="nN">Specifies the number of iterations (based on the ITERATOR_TYPE) to run, or -1 to ignore.</param>
/// <param name="type">Specifies the iteration type (default = ITERATION).</param>
public void Run(Phase phase, int nN, ITERATOR_TYPE type)
{
    MemoryCollection m_rgMemory = new MemoryCollection();
    double? dfRunningReward = null;
    double dfEpisodeReward = 0;
    int nEpisode = 0;
    int nIteration = 0;

    StateBase s = getData(phase, -1);

    if (s.Clip != null)
    {
        throw new Exception("The PG.SIMPLE trainer does not support recurrent layers or clip data, use the 'PG.ST' or 'PG.MT' trainer instead.");
    }

    while (!m_brain.Cancel.WaitOne(0) && !isAtIteration(nN, type, nIteration, nEpisode))
    {
        // Preprocess the observation.
        SimpleDatum x = m_brain.Preprocess(s, m_bUseRawInput);

        // Forward the policy network and sample an action.
        float fAprob;
        int action = m_brain.act(x, out fAprob);

        // Take the next step using the action.
        StateBase s_ = getData(phase, action);
        dfEpisodeReward += s_.Reward;

        if (phase == Phase.TRAIN)
        {
            // Build up the episode memory, using the reward received for taking the action.
            m_rgMemory.Add(new MemoryItem(s, x, action, fAprob, (float)s_.Reward));

            // An episode has finished.
            if (s_.Done)
            {
                nEpisode++;
                nIteration++;

                m_brain.Reshape(m_rgMemory);

                // Compute the discounted reward (backwards through time).
                float[] rgDiscountedR = m_rgMemory.GetDiscountedRewards(m_fGamma, m_bAllowDiscountReset);
                // Rewards are standardized when set to be unit normal (helps control the gradient estimator variance).
                m_brain.SetDiscountedR(rgDiscountedR);

                // Modulate the gradient with the advantage (PG magic happens right here.)
                float[] rgDlogp = m_rgMemory.GetPolicyGradients();
                // The discounted R is applied to the policy gradient within the loss function, just before the backward pass.
                m_brain.SetPolicyGradients(rgDlogp);

                // Train for one iteration, which triggers the loss function.
                List<Datum> rgData = m_rgMemory.GetData();
                m_brain.SetData(rgData);
                m_brain.Train(nIteration);

                // Update the running reward.
                if (!dfRunningReward.HasValue)
                {
                    dfRunningReward = dfEpisodeReward;
                }
                else
                {
                    dfRunningReward = dfRunningReward * 0.99 + dfEpisodeReward * 0.01;
                }

                updateStatus(nIteration, nEpisode, dfEpisodeReward, dfRunningReward.Value);
                dfEpisodeReward = 0;

                s = getData(phase, -1);
                m_rgMemory.Clear();
            }
            else
            {
                s = s_;
            }
        }
        else
        {
            if (s_.Done)
            {
                nEpisode++;

                // Update the running reward.
                if (!dfRunningReward.HasValue)
                {
                    dfRunningReward = dfEpisodeReward;
                }
                else
                {
                    dfRunningReward = dfRunningReward * 0.99 + dfEpisodeReward * 0.01;
                }

                updateStatus(nIteration, nEpisode, dfEpisodeReward, dfRunningReward.Value);
                dfEpisodeReward = 0;

                s = getData(phase, -1);
            }
            else
            {
                s = s_;
            }

            nIteration++;
        }
    }
}
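The per-step values returned by GetPolicyGradients and handed to SetPolicyGradients act as the 'fake label' gradient of the log-probability of the sampled action; the discounted R set earlier is multiplied into them inside the loss, just before the backward pass. The sketch below illustrates that signal for a single sigmoid action head; PolicyGradientSketch, Step, and the two-action assumption are hypothetical and are not the MyCaffe MemoryCollection API.

using System.Collections.Generic;

public static class PolicyGradientSketch
{
    // Hypothetical per-step record mirroring the fields MemoryItem is shown to carry above.
    public class Step
    {
        public int Action;   // sampled action (0 or 1 for a single sigmoid output)
        public float Aprob;  // probability the network assigned to action 1
    }

    public static float[] GetPolicyGradients(IList<Step> rgSteps)
    {
        float[] rgDlogp = new float[rgSteps.Count];

        for (int i = 0; i < rgSteps.Count; i++)
        {
            // 'Fake label' trick: treat the sampled action as the supervised target.
            // For a sigmoid output p = P(action == 1), the gradient of log p(action)
            // with respect to the pre-sigmoid logit is y - p.
            float fY = (rgSteps[i].Action == 1) ? 1.0f : 0.0f;
            rgDlogp[i] = fY - rgSteps[i].Aprob;
        }

        return rgDlogp;
    }
}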