/// <summary>
/// Performs one learning update from the entries accumulated in <c>history</c>
/// and then replaces <c>history</c> with a fresh, empty list.
/// </summary>
/// <remarks>
/// Each history entry is read as a pair: slot 0 is collected as a loss term and
/// slot 1 as a reward. Null slots are skipped, and the two slots are filtered
/// independently of each other.
/// The rewards are shifted by their mean and divided by their standard deviation
/// plus 0.000001, multiplied by the negated losses, summed along axis 0 and
/// back-propagated, after which the optimizer takes a step.
/// NOTE(review): because rewards and losses are filtered separately, this
/// presumably relies on both slots being populated together for every entry;
/// confirm against the code that fills <c>history</c>.
/// NOTE(review): behavior with an empty <c>history</c> depends on
/// <c>Functional.Concatenate</c>, which is not visible here.
/// </remarks>
public void Learn()
{
    List<FloatTensor> collectedRewards = new List<FloatTensor>();
    List<FloatTensor> collectedLosses = new List<FloatTensor>();

    foreach (FloatTensor[] step in history)
    {
        FloatTensor stepReward = step[1];
        if (stepReward != null)
        {
            collectedRewards.Add(stepReward);
        }

        FloatTensor stepLoss = step[0];
        if (stepLoss != null)
        {
            collectedLosses.Add(stepLoss);
        }
    }

    // Join the per-step tensors along axis 0: rewards first, then losses.
    FloatTensor rewards = Functional.Concatenate(this.controller.floatTensorFactory, collectedRewards, 0);
    FloatTensor losses = Functional.Concatenate(this.controller.floatTensorFactory, collectedLosses, 0);

    // Standardize the rewards; the small constant added to the standard
    // deviation keeps the divisor away from exactly zero.
    var centeredRewards = rewards.Sub(rewards.Mean());
    var rewardSpread = rewards.Std().Add(0.000001f);
    var scaledRewards = centeredRewards.Div(rewardSpread);
    scaledRewards.Autograd = true;

    // Weight the negated losses by the standardized rewards and reduce along axis 0.
    var negatedLosses = losses.Neg();
    var weightedLosses = scaledRewards.Mul(negatedLosses);
    var policyLoss = weightedLosses.Sum(0);
    policyLoss.Backward();

    // The first argument is the length of the rewards tensor along axis 0
    // (presumably used as a batch size; confirm against the optimizer).
    optimizer.Step(rewards.Shape[0], 0);

    history = new List<FloatTensor[]>();
}