// Created once per agent (see the replacement branch in backward below):
// re-creating new Random(seed) on every call would make Next() return the
// same index every time, so the generator is kept as a field.
private Random rnd;

public override void backward(double reward)
{
    this.latest_reward = reward;
    this.average_reward_window.add(reward);

    // slide the reward window: drop the oldest entry, append the newest
    this.reward_window.RemoveAt(0);
    this.reward_window.Add(reward);

    if (!this.learning) { return; }

    // various book-keeping
    this.age += 1;

    // it is time t+1 and we have to store (s_t, a_t, r_t, s_{t+1}) as new experience
    // (given that an appropriate number of state measurements already exist, of course)
    if (this.forward_passes > this.temporal_window + 1)
    {
        var n = this.window_size;
        var e = new ExperienceShared
        {
            state0 = this.net_window[n - 2],
            action0 = this.action_window[n - 2],
            reward0 = this.reward_window[n - 2],
            state1 = this.net_window[n - 1],
            agent = this.instance
        };

        // alternative strategies, kept for reference:
        //
        // maintain a dictionary of agent average rewards
        //agentAvgRewards[e.agent] = this.average_reward_window.get_average();
        //
        // save experience only from the "best" agent with the highest average reward
        //var maxAgent = agentAvgRewards.FirstOrDefault(x => x.Value == agentAvgRewards.Values.Max()).Key;
        //if (e.agent == maxAgent)
        //
        // save experience except from the "worst" agent with the lowest average reward
        //var minAgent = agentAvgRewards.Aggregate((x, y) => x.Value < y.Value ? x : y).Key;
        //if (e.agent != minAgent)

        // chosen strategy: save experience from all agents
        if (DeepQLearnShared.experienceShared.Count < this.experience_size)
        {
            // grow the store, using the current count as the next slot index;
            // TryAdd simply returns false if another agent claimed the slot first
            DeepQLearnShared.experienceShared.TryAdd(DeepQLearnShared.experienceShared.Count, e);
        }
        else if (this.experience_size > 0)
        {
            // replace. finite memory! the generator is seeded per instance
            // (otherwise all agents overwrite the same slots) and created only once
            this.rnd = this.rnd ?? new Random(Int32.Parse(this.instance));
            var ri = this.rnd.Next(0, this.experience_size);
            DeepQLearnShared.experienceShared[ri] = e;
        }
    }

    // learn based on experience, once we have some samples to go on
    // this is where the magic happens...
    if (DeepQLearnShared.experienceShared.Count > this.start_learn_threshold)
    {
        var avcost = 0.0;
        for (var k = 0; k < this.tdtrainer.batch_size; k++)
        {
            // sample a random experience; retry (at most 10 times) if the slot is empty
            int i = 0;
            ExperienceShared e;
            do
            {
                var re = util.randi(0, DeepQLearnShared.experienceShared.Count);
                e = DeepQLearnShared.experienceShared[re];
                i++;
            } while (e == null && i < 10);

            // one-step Q-learning target: r_0 + gamma * max_a Q(s_1, a)
            var x = new Volume(1, 1, this.net_inputs);
            x.w = e.state0;
            var maxact = this.policy(e.state1);
            var r = e.reward0 + this.gamma * maxact.value;
            var ystruct = new Entry { dim = e.action0, val = r };
            var loss = this.tdtrainer.train(x, ystruct);
            avcost += double.Parse(loss["loss"]);
        }

        avcost = avcost / this.tdtrainer.batch_size;
        this.average_loss_window.add(avcost);
    }
}
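The method above leans on two pieces defined elsewhere: the ExperienceShared record and the static store on DeepQLearnShared. For orientation, here is a minimal sketch of what they could look like; the member names and the TryAdd usage are taken from the code above, while the field types and the ConcurrentDictionary choice are inferred from usage and should be treated as assumptions:

using System.Collections.Concurrent;

// one experience tuple (s_t, a_t, r_t, s_{t+1}) plus the id of the agent
// that produced it; names from the usage above, types inferred
public class ExperienceShared
{
    public double[] state0;
    public int action0;
    public double reward0;
    public double[] state1;
    public string agent;
}

// inside DeepQLearnShared: one process-wide replay store, keyed by slot index
// and shared by all agent instances; a ConcurrentDictionary lets concurrent
// agents write without explicit locking
public static readonly ConcurrentDictionary<int, ExperienceShared> experienceShared =
    new ConcurrentDictionary<int, ExperienceShared>();

Keying by slot index is what makes TryAdd a cheap way to resolve races: if two agents read the same Count and race for the same slot, the loser's TryAdd just returns false and that one sample is dropped.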
public void Update(int index, ExperienceShared es)
{
    // List<T> is not thread-safe, so writes from concurrent agents are serialized here
    lock (experienceShared)
    {
        experienceShared[index] = es;
    }
}
// Created once per agent, as in the static-dictionary variant: re-creating
// new Random(seed) on every call would make Next() return the same index every time.
private Random rnd;

public override void backward(double reward)
{
    this.latest_reward = reward;
    this.average_reward_window.add(reward);

    // slide the reward window: drop the oldest entry, append the newest
    this.reward_window.RemoveAt(0);
    this.reward_window.Add(reward);

    if (!this.learning) { return; }

    // various book-keeping
    this.age += 1;

    // it is time t+1 and we have to store (s_t, a_t, r_t, s_{t+1}) as new experience
    // (given that an appropriate number of state measurements already exist, of course)
    if (this.forward_passes > this.temporal_window + 1)
    {
        var n = this.window_size;
        var e = new ExperienceShared
        {
            state0 = this.net_window[n - 2],
            action0 = this.action_window[n - 2],
            reward0 = this.reward_window[n - 2],
            state1 = this.net_window[n - 1],
            agent = this.instance
        };

        // save experience from all agents
        if (ExperienceSharedSingleton.Instance().experienceShared.Count < this.experience_size)
        {
            ExperienceSharedSingleton.Instance().Add(e);
        }
        else if (this.experience_size > 0)
        {
            // replace. finite memory! the generator is seeded per instance
            // (otherwise all agents overwrite the same slots) and created only once
            this.rnd = this.rnd ?? new Random(Int32.Parse(this.instance));
            var ri = this.rnd.Next(0, this.experience_size);
            ExperienceSharedSingleton.Instance().Update(ri, e);
        }
    }

    // learn based on experience, once we have some samples to go on
    // this is where the magic happens...
    if (ExperienceSharedSingleton.Instance().experienceShared.Count > this.start_learn_threshold)
    {
        var avcost = 0.0;
        for (var k = 0; k < this.tdtrainer.batch_size; k++)
        {
            // sample a random experience; retry (at most 10 times) if the slot is empty
            int i = 0;
            ExperienceShared e;
            do
            {
                var re = util.randi(0, ExperienceSharedSingleton.Instance().experienceShared.Count);
                e = ExperienceSharedSingleton.Instance().Retrieve(re);
                i++;
            } while (e == null && i < 10);

            // one-step Q-learning target: r_0 + gamma * max_a Q(s_1, a)
            var x = new Volume(1, 1, this.net_inputs);
            x.w = e.state0;
            var maxact = this.policy(e.state1);
            var r = e.reward0 + this.gamma * maxact.value;
            var ystruct = new Entry { dim = e.action0, val = r };
            var loss = this.tdtrainer.train(x, ystruct);
            avcost += double.Parse(loss["loss"]);
        }

        avcost = avcost / this.tdtrainer.batch_size;
        this.average_loss_window.add(avcost);
    }
}
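In both variants the value regressed onto the chosen action is the standard one-step Q-learning target: the stored reward plus the discounted value of the best action the current network sees from the next state,

y = r_0 + \gamma \max_{a'} Q(s_1, a')

which is exactly what e.reward0 + this.gamma * maxact.value computes before being packed into ystruct for the trainer.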
public void Add(ExperienceShared es)
{
    // List<T> is not thread-safe, so writes from concurrent agents are serialized here
    lock (experienceShared)
    {
        experienceShared.Add(es);
    }
}
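The singleton that Add, Update, and the sampling loop talk to is not shown above. A minimal sketch consistent with the calls used (Instance(), the public experienceShared list, and Retrieve) might look like the following; the eager initialization and the out-of-range guard in Retrieve are assumptions, not the original implementation:

using System.Collections.Generic;

public class ExperienceSharedSingleton
{
    // one shared instance per process; all agents funnel their experience through it
    private static readonly ExperienceSharedSingleton instance = new ExperienceSharedSingleton();

    // the shared replay memory; List<T> is not thread-safe, hence the locks
    // here and in Add/Update above
    public List<ExperienceShared> experienceShared = new List<ExperienceShared>();

    private ExperienceSharedSingleton() { }

    public static ExperienceSharedSingleton Instance() { return instance; }

    public ExperienceShared Retrieve(int index)
    {
        lock (experienceShared)
        {
            // another agent may have shrunk or grown the list since the caller
            // read Count; returning null lets the sampling loop in backward() retry
            return (index >= 0 && index < experienceShared.Count) ? experienceShared[index] : null;
        }
    }
}

Unlike the ConcurrentDictionary variant, nothing about List<T> is safe under concurrent writers, so every accessor takes a lock; the trade-off is simpler indexing at the cost of coarse-grained serialization.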