Exemplo n.º 1
0
        /// <summary>
        /// Reshapes the internal work blobs (discounted reward, policy gradient and action one-hot
        /// blobs, plus their '1' counterparts) to match the number of items in the memory collection
        /// and the widest non-loss output of the network. The loss blob is reshaped to a single element.
        /// </summary>
        /// <param name="col">Specifies the memory collection; only its Count is used, as the 'num' dimension of each blob.</param>
        /// <returns>Returns the largest channel count found among the non-loss output blobs (never less than 1).</returns>
        /// <exception cref="Exception">Thrown when every output blob of the network is a loss blob.</exception>
        public int Reshape(MemoryCollection col)
        {
            // NOTE: the channel/height/width of the first memory item were previously read into
            // locals that were never used (and the width was mistakenly read from Height), so
            // those reads have been removed.
            int  nNum         = col.Count;
            int  nActionProbs = 1;
            bool bFoundOutput = false;

            // Scan the network outputs, skipping the loss, and track the widest channel count.
            for (int i = 0; i < m_net.output_blobs.Count; i++)
            {
                if (m_net.output_blobs[i].type == Blob <T> .BLOB_TYPE.LOSS)
                {
                    continue;
                }

                nActionProbs = Math.Max(m_net.output_blobs[i].channels, nActionProbs);
                bFoundOutput = true;
            }

            if (!bFoundOutput)
            {
                throw new Exception("Could not find a non-loss output!  Your model should output the loss and the action probabilities.");
            }

            // All per-step blobs share the same shape: one row per memory item, one channel per action probability.
            m_blobDiscountedR.Reshape(nNum, nActionProbs, 1, 1);
            m_blobPolicyGradient.Reshape(nNum, nActionProbs, 1, 1);
            m_blobActionOneHot.Reshape(nNum, nActionProbs, 1, 1);
            m_blobDiscountedR1.Reshape(nNum, nActionProbs, 1, 1);
            m_blobPolicyGradient1.Reshape(nNum, nActionProbs, 1, 1);
            m_blobActionOneHot1.Reshape(nNum, nActionProbs, 1, 1);
            m_blobLoss.Reshape(1, 1, 1, 1);

            return(nActionProbs);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Runs the main 'actor' loop.  Each pass through the loop:
        /// 1.) preprocesses the current state into an observation,
        /// 2.) asks the brain for an action and its action probabilities,
        /// 3.) steps the environment with that action and accumulates the reward,
        /// 4.) when in the TRAIN phase, records the step and, once the episode is done,
        ///     loads the episode's rewards, probabilities, one-hot vectors and data into the
        ///     brain and trains on them.
        /// The loop ends when the brain's cancel event is signaled or isAtIteration reports completion.
        /// </summary>
        /// <param name="phase">Specifies the phase; experiences are only collected and trained on when this is Phase.TRAIN.</param>
        /// <param name="nN">Specifies the number of iterations (interpreted by isAtIteration according to the type) to run; -1 is presumably treated as 'no limit'.</param>
        /// <param name="type">Specifies the iteration type passed to isAtIteration.</param>
        /// <param name="step">Specifies the training step to take, if any.  FORWARD returns right after the first action is selected; any other value except NONE returns after the first trained episode.</param>
        public void Run(Phase phase, int nN, ITERATOR_TYPE type, TRAIN_STEP step)
        {
            bool             bTraining       = (phase == Phase.TRAIN);
            MemoryCollection episodeMemory   = new MemoryCollection();
            double?          dfRunning       = null;
            double           dfEpisodeTotal  = 0;
            int              nEpisodeCount   = 0;
            int              nIterationCount = 0;

            StateBase state = getData(phase, -1);

            while (!m_brain.Cancel.WaitOne(0) && !isAtIteration(nN, type, nIterationCount, nEpisodeCount))
            {
                // Turn the raw state into the observation fed to the policy network.
                SimpleDatum observation = m_brain.Preprocess(state, m_bUseRawInput);

                // Run the policy network forward and pick an action.
                float[] rgfProbabilities;
                int     nAction = m_brain.act(observation, state.Clip, out rgfProbabilities);

                // When only stepping the forward pass, stop here.
                if (step == TRAIN_STEP.FORWARD)
                {
                    return;
                }

                // Advance the environment with the chosen action and bank its reward.
                StateBase nextState = getData(phase, nAction);
                dfEpisodeTotal += nextState.Reward;

                // While training, remember this step along with the reward earned for the action.
                if (bTraining)
                {
                    episodeMemory.Add(new MemoryItem(state, observation, nAction, rgfProbabilities, (float)nextState.Reward));
                }

                // Episode still in progress: move on to the next state.
                if (!nextState.Done)
                {
                    state = nextState;

                    // Outside of training, every environment step counts as an iteration.
                    if (!bTraining)
                    {
                        nIterationCount++;
                    }

                    continue;
                }

                // ---- The episode has finished. ----
                nEpisodeCount++;

                if (bTraining)
                {
                    // During training, an iteration is counted once per completed episode.
                    nIterationCount++;

                    m_brain.Reshape(episodeMemory);

                    // Discounted rewards, computed over the stored episode.
                    float[] rgRewards = episodeMemory.GetDiscountedRewards(m_fGamma, m_bAllowDiscountReset);
                    m_brain.SetDiscountedR(rgRewards);

                    // Action probabilities recorded for each step of the episode.
                    float[] rgProbSet = episodeMemory.GetActionProbabilities();
                    m_brain.SetActionProbabilities(rgProbSet);

                    // One-hot action vectors for each step (per the original notes: a full one-hot
                    // vector when using Softmax, or a single element holding the action value when
                    // using a binary probability such as Sigmoid).
                    float[] rgOneHotSet = episodeMemory.GetActionOneHotVectors();
                    m_brain.SetActionOneHotVectors(rgOneHotSet);

                    // Load the episode data and train for one iteration.
                    List <Datum> rgEpisodeData = episodeMemory.GetData();
                    List <Datum> rgEpisodeClip = episodeMemory.GetClip();
                    m_brain.SetData(rgEpisodeData, rgEpisodeClip);
                    m_brain.Train(nIterationCount, step);
                }

                // Running reward: seeded with the first episode's reward, then blended 99% / 1%.
                dfRunning = dfRunning.HasValue
                    ? dfRunning.Value * 0.99 + dfEpisodeTotal * 0.01
                    : dfEpisodeTotal;

                updateStatus(nIterationCount, nEpisodeCount, dfEpisodeTotal, dfRunning.Value);
                dfEpisodeTotal = 0;

                // Request a fresh starting state (action -1) for the next episode.
                state = getData(phase, -1);

                if (bTraining)
                {
                    episodeMemory.Clear();

                    // When single-stepping the training, stop after one trained episode.
                    if (step != TRAIN_STEP.NONE)
                    {
                        return;
                    }
                }
                else
                {
                    nIterationCount++;
                }
            }
        }