public int Reshape(MemoryCollection col)
{
    int nNum = col.Count;
    int nChannels = col[0].Data.Channels;
    int nHeight = col[0].Data.Height;
    int nWidth = col[0].Data.Width;
    int nActionProbs = 1;
    int nFound = 0;

    for (int i = 0; i < m_net.output_blobs.Count; i++)
    {
        if (m_net.output_blobs[i].type != Blob<T>.BLOB_TYPE.LOSS)
        {
            int nCh = m_net.output_blobs[i].channels;
            nActionProbs = Math.Max(nCh, nActionProbs);
            nFound++;
        }
    }

    if (nFound == 0)
        throw new Exception("Could not find a non-loss output!  Your model should output the loss and the action probabilities.");

    m_blobDiscountedR.Reshape(nNum, nActionProbs, 1, 1);
    m_blobPolicyGradient.Reshape(nNum, nActionProbs, 1, 1);
    m_blobActionOneHot.Reshape(nNum, nActionProbs, 1, 1);
    m_blobDiscountedR1.Reshape(nNum, nActionProbs, 1, 1);
    m_blobPolicyGradient1.Reshape(nNum, nActionProbs, 1, 1);
    m_blobActionOneHot1.Reshape(nNum, nActionProbs, 1, 1);
    m_blobLoss.Reshape(1, 1, 1, 1);

    return nActionProbs;
}
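// Illustrative sketch only (not part of the trainer): it shows why the discounted-reward, policy-gradient
// and action one-hot blobs reshaped above all share the same (nNum x nActionProbs) shape.  In a
// REINFORCE-style update, the initial gradient fed back into the policy network is typically formed
// element-wise from these three arrays.  The helper name 'computePolicyGradient', the per-element layout
// of the discounted reward, and the sign convention are assumptions for illustration; the actual loss
// layer used by the trainer may differ.
private static float[] computePolicyGradient(float[] rgAprob, float[] rgActionOneHot, float[] rgDiscountedR)
{
    float[] rgGrad = new float[rgAprob.Length];

    // dL/dlogit = (p - y) * R for each item/action pair, where p is the predicted action probability,
    // y is the one-hot (or binary) action actually taken, and R is the discounted reward for that item.
    for (int i = 0; i < rgGrad.Length; i++)
        rgGrad[i] = (rgAprob[i] - rgActionOneHot[i]) * rgDiscountedR[i];

    return rgGrad;
}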
/// <summary>
/// The Run method provides the main 'actor' loop that performs the following steps:
/// 1.) get state
/// 2.) build experience
/// 3.) create policy gradients
/// 4.) train on experiences
/// </summary>
/// <param name="phase">Specifies the phase.</param>
/// <param name="nN">Specifies the number of iterations (based on the ITERATION_TYPE) to run, or -1 to ignore.</param>
/// <param name="type">Specifies the iteration type (default = ITERATION).</param>
/// <param name="step">Specifies the training step to take, if any.  This is only used when debugging.</param>
public void Run(Phase phase, int nN, ITERATOR_TYPE type, TRAIN_STEP step)
{
    MemoryCollection m_rgMemory = new MemoryCollection();
    double? dfRunningReward = null;
    double dfEpisodeReward = 0;
    int nEpisode = 0;
    int nIteration = 0;

    StateBase s = getData(phase, -1);

    while (!m_brain.Cancel.WaitOne(0) && !isAtIteration(nN, type, nIteration, nEpisode))
    {
        // Preprocess the observation.
        SimpleDatum x = m_brain.Preprocess(s, m_bUseRawInput);

        // Forward the policy network and sample an action.
        float[] rgfAprob;
        int action = m_brain.act(x, s.Clip, out rgfAprob);

        if (step == TRAIN_STEP.FORWARD)
            return;

        // Take the next step using the action.
        StateBase s_ = getData(phase, action);
        dfEpisodeReward += s_.Reward;

        if (phase == Phase.TRAIN)
        {
            // Build up episode memory, using the reward for taking the action.
            m_rgMemory.Add(new MemoryItem(s, x, action, rgfAprob, (float)s_.Reward));

            // An episode has finished.
            if (s_.Done)
            {
                nEpisode++;
                nIteration++;

                m_brain.Reshape(m_rgMemory);

                // Compute the discounted reward (backwards through time).
                float[] rgDiscountedR = m_rgMemory.GetDiscountedRewards(m_fGamma, m_bAllowDiscountReset);
                // Rewards are standardized to be unit normal when set (helps control the gradient estimator variance).
                m_brain.SetDiscountedR(rgDiscountedR);

                // Get the action probabilities.
                float[] rgfAprobSet = m_rgMemory.GetActionProbabilities();
                // The action probabilities are used to calculate the initial gradient within the loss function.
                m_brain.SetActionProbabilities(rgfAprobSet);

                // Get the action one-hot vectors.  When using Softmax, each action set contains a one-hot
                // vector (e.g. 3 actions with action 0 set would return the vector <1,0,0>).
                // When using a binary probability (e.g. with Sigmoid), each action set only contains a
                // single element which is set to the action value itself (e.g. 0 for action '0' and 1 for action '1').
                float[] rgfAonehotSet = m_rgMemory.GetActionOneHotVectors();
                m_brain.SetActionOneHotVectors(rgfAonehotSet);

                // Train for one iteration, which triggers the loss function.
                List<Datum> rgData = m_rgMemory.GetData();
                List<Datum> rgClip = m_rgMemory.GetClip();
                m_brain.SetData(rgData, rgClip);
                m_brain.Train(nIteration, step);

                // Update the running reward.
                if (!dfRunningReward.HasValue)
                    dfRunningReward = dfEpisodeReward;
                else
                    dfRunningReward = dfRunningReward * 0.99 + dfEpisodeReward * 0.01;

                updateStatus(nIteration, nEpisode, dfEpisodeReward, dfRunningReward.Value);
                dfEpisodeReward = 0;
                s = getData(phase, -1);
                m_rgMemory.Clear();

                if (step != TRAIN_STEP.NONE)
                    return;
            }
            else
            {
                s = s_;
            }
        }
        else
        {
            if (s_.Done)
            {
                nEpisode++;

                // Update the running reward.
                if (!dfRunningReward.HasValue)
                    dfRunningReward = dfEpisodeReward;
                else
                    dfRunningReward = dfRunningReward * 0.99 + dfEpisodeReward * 0.01;

                updateStatus(nIteration, nEpisode, dfEpisodeReward, dfRunningReward.Value);
                dfEpisodeReward = 0;
                s = getData(phase, -1);
            }
            else
            {
                s = s_;
            }

            nIteration++;
        }
    }
}
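// Illustrative sketch only (not part of the trainer): it shows one common way the discounted, standardized
// reward used above (MemoryCollection.GetDiscountedRewards followed by Brain.SetDiscountedR) could be
// computed.  The helper name 'computeDiscountedRewards', its parameters, and the reset-on-non-zero-reward
// rule for bAllowReset are assumptions for illustration; the actual MyCaffe implementation may differ.
private static float[] computeDiscountedRewards(float[] rgReward, float fGamma, bool bAllowReset)
{
    float[] rgDiscounted = new float[rgReward.Length];
    float fRunning = 0;

    // Accumulate the reward backwards through time: R[t] = r[t] + gamma * R[t+1].
    for (int t = rgReward.Length - 1; t >= 0; t--)
    {
        // Optionally reset the running sum at a game boundary (marked here by a non-zero reward, as in Pong).
        if (bAllowReset && rgReward[t] != 0)
            fRunning = 0;

        fRunning = fRunning * fGamma + rgReward[t];
        rgDiscounted[t] = fRunning;
    }

    // Standardize to zero mean and unit variance to help control the variance of the gradient estimator.
    float fSum = 0;
    for (int t = 0; t < rgDiscounted.Length; t++)
        fSum += rgDiscounted[t];
    float fMean = fSum / rgDiscounted.Length;

    float fSqSum = 0;
    for (int t = 0; t < rgDiscounted.Length; t++)
        fSqSum += (rgDiscounted[t] - fMean) * (rgDiscounted[t] - fMean);
    float fStd = (float)Math.Sqrt(fSqSum / rgDiscounted.Length);

    for (int t = 0; t < rgDiscounted.Length; t++)
        rgDiscounted[t] = (fStd != 0) ? (rgDiscounted[t] - fMean) / fStd : (rgDiscounted[t] - fMean);

    return rgDiscounted;
}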