示例#1
0
        /// <summary>
        /// Performs one update from the entries recorded in <c>history</c>, then
        /// replaces <c>history</c> with a fresh empty list.
        /// </summary>
        /// <remarks>
        /// Steps, in order:
        /// 1. Gather the non-null tensors stored at index 0 (used as losses) and
        ///    index 1 (used as rewards) of every history entry.
        /// 2. Concatenate each group via <c>Functional.Concatenate</c> (last argument 0).
        /// 3. Normalize the rewards: (rewards - mean) / (std + 0.000001).
        /// 4. Compute Sum(0) of normalized rewards times negated losses and call
        ///    <c>Backward()</c> on the result.
        /// 5. Call <c>optimizer.Step</c> with <c>rewards.Shape[0]</c> and 0.
        /// </remarks>
        public void Learn()
        {
            var collectedLosses  = new List<FloatTensor>();
            var collectedRewards = new List<FloatTensor>();

            // Each history entry is indexed as [0] = loss, [1] = reward; null slots are skipped.
            // NOTE(review): the two slots are filtered independently, so the two lists can end up
            // with different lengths if only one slot of an entry is null — confirm callers always
            // fill both before Learn() runs.
            foreach (var step in history)
            {
                var stepLoss = step[0];
                var stepReward = step[1];

                if (stepReward != null)
                {
                    collectedRewards.Add(stepReward);
                }

                if (stepLoss != null)
                {
                    collectedLosses.Add(stepLoss);
                }
            }

            // NOTE(review): behavior of Concatenate on an empty list is not visible here — verify.
            var factory = this.controller.floatTensorFactory;
            FloatTensor rewards = Functional.Concatenate(factory, collectedRewards, 0);
            FloatTensor losses  = Functional.Concatenate(factory, collectedLosses, 0);

            // Center the rewards, then scale by the standard deviation; the small constant
            // keeps the divisor non-zero when the standard deviation is zero.
            var centered = rewards.Sub(rewards.Mean());
            var scale = rewards.Std().Add(0.000001f);
            var normalized = centered.Div(scale);

            // Autograd is switched on for the normalized rewards before the loss is built from them.
            normalized.Autograd = true;

            var negatedLosses = losses.Neg();
            var policyLoss = normalized.Mul(negatedLosses).Sum(0);

            policyLoss.Backward();

            // First argument is the size of the rewards tensor along its first dimension.
            optimizer.Step(rewards.Shape[0], 0);

            // Start the next round with an empty history.
            history = new List<FloatTensor[]>();
        }