Example #1
        public override void backward(double reward)
        {
            this.latest_reward = reward;
            this.average_reward_window.add(reward);

            this.reward_window.RemoveAt(0);
            this.reward_window.Add(reward);

            if (!this.learning)
            {
                return;
            }

            // various book-keeping
            this.age += 1;

            // it is time t+1 and we have to store (s_t, a_t, r_t, s_{t+1}) as new experience
            // (given that an appropriate number of state measurements already exist, of course)
            if (this.forward_passes > this.temporal_window + 1)
            {
                var e = new ExperienceShared();
                var n = this.window_size;
                e.state0  = this.net_window[n - 2];
                e.action0 = this.action_window[n - 2];
                e.reward0 = this.reward_window[n - 2];
                e.state1  = this.net_window[n - 1];
                e.agent   = this.instance;

                // maintain dictionary of agent average rewards
                //agentAvgRewards.AddOrUpdate(e.agent,this.average_reward_window.get_average());
                //if (agentAvgRewards.ContainsKey(e.agent))
                //{
                //    agentAvgRewards[e.agent] = this.average_reward_window.get_average();
                //}
                //else
                //{
                //    agentAvgRewards.Add(e.agent, this.average_reward_window.get_average());
                //}

                // save experience only from the "best" agent with the highest average reward
                //var maxAgent = agentAvgRewards.FirstOrDefault(x => x.Value == agentAvgRewards.Values.ToList().Max()).Key;
                //if (e.agent == maxAgent)
                // save experience except from the "worst" agent with the lowest average reward
                //var minAgent = agentAvgRewards.Aggregate((x, y) => x.Value < y.Value ? x : y).Key;
                //if (e.agent != minAgent)

                // save experience from all agents
                if (DeepQLearnShared.experienceShared.Count < this.experience_size)
                {
                    // index of the next free slot in the shared store
                    var ix = DeepQLearnShared.experienceShared.Count;
                    if (e != null)
                    {
                        DeepQLearnShared.experienceShared.TryAdd(ix, e);
                    }
                }
                else if (this.experience_size > 0)
                {
                    // replace: finite memory. Seed the generator per instance and per call (via the
                    // agent's age) so concurrent agents do not keep overwriting the same slot.
                    var ri = new Random(unchecked(Int32.Parse(this.instance) * 997 + (int)this.age)).Next(0, this.experience_size);
                    if (e != null)
                    {
                        DeepQLearnShared.experienceShared[ri] = e;
                    }
                }
            }

            // learn based on experience, once we have some samples to go on
            // this is where the magic happens...
            if (DeepQLearnShared.experienceShared.Count > this.start_learn_threshold)
            {
                var avcost = 0.0;
                for (var k = 0; k < this.tdtrainer.batch_size; k++)
                {
                    // sample a random experience; retry a few times in case an empty slot is drawn
                    int i = 0;
                    ExperienceShared e;
                    do
                    {
                        var re = util.randi(0, DeepQLearnShared.experienceShared.Count);
                        e = DeepQLearnShared.experienceShared[re];
                        i++;
                    } while (e == null && i < 10);
                    // wrap s_t in a Volume and regress the taken action towards the
                    // Q-learning target r_t + gamma * max_a Q(s_{t+1}, a)
                    var x = new Volume(1, 1, this.net_inputs);
                    x.w = e.state0;
                    var maxact = this.policy(e.state1);
                    var r      = e.reward0 + this.gamma * maxact.value;

                    var ystruct = new Entry {
                        dim = e.action0, val = r
                    };
                    var loss = this.tdtrainer.train(x, ystruct);
                    avcost += double.Parse(loss["loss"]);
                }

                avcost = avcost / this.tdtrainer.batch_size;
                this.average_loss_window.add(avcost);
            }
        }
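The ExperienceShared type that backward() fills in is not listed on this page. A minimal sketch of what it might look like, with field types inferred from how the fields are used above (state snapshots are copied into Volume.w, the action becomes the training target dimension, and the agent id is the string instance that also seeds the replacement RNG); these are assumptions, not the actual declaration:

public class ExperienceShared
{
    public double[] state0;   // s_t      : network input at time t, copied into Volume.w for training
    public int      action0;  // a_t      : index of the action taken, used as the target dimension
    public double   reward0;  // r_t      : reward received after taking a_t
    public double[] state1;   // s_{t+1}  : network input at time t+1, fed to the greedy policy
    public string   agent;    // id of the agent instance that produced this experience
}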
Example #2
 public void Update(int index, ExperienceShared es)
 {
     experienceShared[index] = es;
 }
Example #3
        public override void backward(double reward)
        {
            this.latest_reward = reward;
            this.average_reward_window.add(reward);

            this.reward_window.RemoveAt(0);
            this.reward_window.Add(reward);

            if (!this.learning)
            {
                return;
            }

            // various book-keeping
            this.age += 1;

            // it is time t+1 and we have to store (s_t, a_t, r_t, s_{t+1}) as new experience
            // (given that an appropriate number of state measurements already exist, of course)
            if (this.forward_passes > this.temporal_window + 1)
            {
                var e = new ExperienceShared();
                var n = this.window_size;
                e.state0  = this.net_window[n - 2];
                e.action0 = this.action_window[n - 2];
                e.reward0 = this.reward_window[n - 2];
                e.state1  = this.net_window[n - 1];
                e.agent   = this.instance;

                // save experience from all agents
                if (ExperienceSharedSingleton.Instance().experienceShared.Count < this.experience_size)
                {
                    if (e != null)
                    {
                        ExperienceSharedSingleton.Instance().experienceShared.Add(e);
                    }
                }
                else if (this.experience_size > 0)
                {
                    // replace: finite memory. Seed the generator per instance and per call (via the
                    // agent's age) so concurrent agents do not keep overwriting the same slot.
                    var ri = new Random(unchecked(Int32.Parse(this.instance) * 997 + (int)this.age)).Next(0, this.experience_size);
                    if (e != null)
                    {
                        ExperienceSharedSingleton.Instance().Update(ri, e);
                    }
                }
            }

            // learn based on experience, once we have some samples to go on
            // this is where the magic happens...
            if (ExperienceSharedSingleton.Instance().experienceShared.Count > this.start_learn_threshold)
            {
                var avcost = 0.0;
                for (var k = 0; k < this.tdtrainer.batch_size; k++)
                {
                    // sample a random experience; retry a few times in case an empty slot is drawn
                    int i = 0;
                    ExperienceShared e;
                    do
                    {
                        var re = util.randi(0, ExperienceSharedSingleton.Instance().experienceShared.Count);
                        e = ExperienceSharedSingleton.Instance().Retrieve(re);
                        i++;
                    } while (e == null && i < 10);
                    var x = new Volume(1, 1, this.net_inputs);
                    x.w = e.state0;
                    var maxact = this.policy(e.state1);
                    var r      = e.reward0 + this.gamma * maxact.value;

                    var ystruct = new Entry {
                        dim = e.action0, val = r
                    };
                    var loss = this.tdtrainer.train(x, ystruct);
                    avcost += double.Parse(loss["loss"]);
                }

                avcost = avcost / this.tdtrainer.batch_size;
                this.average_loss_window.add(avcost);
            }
        }
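Example #3 also reads samples back through Retrieve, which does not appear among these examples. A hedged sketch, mirroring the Add and Update fragments shown in Examples #4 and #2 and assuming a simple bounds check (the sampling loop above already tolerates a null result):

 public ExperienceShared Retrieve(int index)
 {
     // return the stored experience, or null if that slot does not exist yet
     if (index < 0 || index >= experienceShared.Count)
     {
         return null;
     }
     return experienceShared[index];
 }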
Example #4
 public void Add(ExperienceShared es)
 {
     experienceShared.Add(es);
 }
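Examples #2 and #4 are instance methods of ExperienceSharedSingleton, whose surrounding class is not reproduced here. A minimal sketch of the scaffolding those fragments imply: a lazily created shared instance and a List<ExperienceShared> backing field. Whether the real class adds locking or a concurrent collection for multi-agent access is not visible from these snippets, so none is shown:

using System.Collections.Generic;

public class ExperienceSharedSingleton
{
    private static ExperienceSharedSingleton instance;

    // shared replay memory, read directly as .experienceShared in Example #3
    public List<ExperienceShared> experienceShared = new List<ExperienceShared>();

    private ExperienceSharedSingleton() { }

    // lazily created single instance shared by every agent in the process
    public static ExperienceSharedSingleton Instance()
    {
        if (instance == null)
        {
            instance = new ExperienceSharedSingleton();
        }
        return instance;
    }

    // the Add and Update members from Examples #4 and #2, plus the Retrieve
    // sketched after Example #3, complete the class
}

Note that this lazy Instance() is not thread-safe on its own; Example #1 sidesteps the question by storing experiences in a static collection (the TryAdd call suggests a ConcurrentDictionary keyed by index) rather than going through a singleton.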