Example #1
        public void doTask()
        {
            ModelUndo mu      = new ModelUndo(m_agent);
            TimeSpan  elapsed = DateTime.Now - m_timer;

            try
            {
                do
                {
                    lock (m_stat_mutex)
                    {
                        if (m_samples >= m_num_simulations)
                        {
                            break;
                        }
                        m_samples++;
                    }
                    m_agent.spc.gbl_search_root.sample(m_agent, m_rng, 0);
                    elapsed = DateTime.Now - m_timer;
                } while (Math.Abs(elapsed.TotalMilliseconds) < m_time_limit);
            }
            catch (Exception e)
            {
                // exceptions from this sampling worker are caught so they do not
                // take down the application; the logging calls below stay disabled
                //m_agent.logbox.Items.Add(e.Message);
                //m_agent.logbox.Items.Add(e.StackTrace);
                //m_agent.logbox.Items.Add(e.InnerException);
            }
        }
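A minimal sketch of how doTask() might be driven: several workers sampling in parallel until the shared sample budget or time limit is reached. The MCTSWorker class name, its constructor, and the worker count are assumptions for illustration and are not part of the example above.

        // Hypothetical driver (not from the original source): start several
        // sampling workers and wait for all of them before reading the tree.
        public void runParallelSearch(Agent agent, int numWorkers)
        {
            List<Thread> threads = new List<Thread>();     // System.Collections.Generic
            for (int i = 0; i < numWorkers; i++)
            {
                // MCTSWorker is assumed to be the class that owns doTask(),
                // m_agent, m_rng, m_timer and the shared m_samples counter.
                MCTSWorker worker = new MCTSWorker(agent);
                Thread t = new Thread(worker.doTask);      // System.Threading
                threads.Add(t);
                t.Start();
            }
            foreach (Thread t in threads)
            {
                t.Join();
            }
        }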
Example #2
        /* selects the best action determined by the MCTS statistics */
        public action_t selectBestMCTSAction(Agent agent)
        {
            ModelUndo mu          = new ModelUndo(agent);
            action_t  best_action = agent.selectRandomAction(agent.rng);
            double    best_exp    = double.NegativeInfinity;
            bool      found       = false;

            for (UInt64 a = 0; a < agent.numActions(); a++)
            {
                SearchNode n = agent.spc.findNode(agent.hashAfterAction(a));
                if (n != null)
                {
                    // a little noise breaks ties between equally valued actions
                    double noise = agent.rng.NextDouble() * 0.0001;
                    double exp   = n.expectation() + noise;
                    if (exp > best_exp)
                    {
                        best_exp    = exp; // keep the noisy value so the tie-break stays consistent
                        best_action = a;
                        found       = true;
                    }
                    // agent.logbox.Items.Add("action " + a + ":" + exp + " visits " + n.visits() + " self-predicted probability :" + agent.getPredictedActionProb(a));
                }
            }

            //agent.logbox.Items.Add(" selectBestMCTSAction=" + best_action +"found ="+found );


            return(best_action);
        }
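A hypothetical caller for the method above, assuming action_t is the UInt64 alias the other examples imply. searchAndSample() stands in for whatever routine fills agent.spc with MCTS samples (for example the doTask() loop in Example #1); it and chooseAction() are illustrative names, not part of the original code.

        // Hypothetical usage: run the time-boxed search, then commit to the
        // statistically best action found in the tree.
        public action_t chooseAction(Agent agent)
        {
            searchAndSample(agent);                      // assumed helper, see Example #1
            action_t best = selectBestMCTSAction(agent); // from the example above
            agent.modelUpdate(best);                     // apply the chosen action to the model
            return(best);
        }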
Example #3
        // perform a sample run through this node and its children,
        // returning the accumulated reward from this sample run
        public double sample(Agent agent, Random rng, int dfr)
        {
            // the search horizon counts an action and a percept per cycle,
            // so the recursion bottoms out at depth horizon * 2
            if (dfr == (int)agent.horizon() * 2)
            {
                return(0.0);
            }

            ModelUndo undo   = new ModelUndo(agent);
            double    reward = 0.0;

            if (m_chance_node)
            {                                                 // handle chance nodes
                // generate a hypothetical percept
                symbol_list_t percept = new symbol_list_t(0); //(UInt64)(agent.m_obs_bits + agent.m_rew_bits));
                agent.genPerceptAndUpdate(rng, percept);

                // extract the reward for this transition, and
                // update the agent model
                reward = agent.rewardFromPercept(percept);

                SearchNode n = agent.spc.findOrCreateNode(agent.hash(), false);
                reward += n.sample(agent, rng, dfr + 1);
                agent.modelRevert(undo);
            }
            else
            {  // handle decision nodes
                lock (m_mutex)
                {
                    // if we need to do a playout
                    bool do_playout =
                        visits() < MinVisitsBeforeExpansion ||
                        dfr >= MaxDistanceFromRoot ||
                        agent.spc.search_node_pool.Count >= (int)agent.spc.MaxSearchNodes;

                    if (do_playout)
                    {
                        //m_mutex.unlock();

                        reward = playout(agent, rng, (int)agent.horizon() - dfr / 2);
                    }
                    else
                    {
                        // pick an action
                        UInt64 a = selectAction(agent, rng);
                        //m_mutex.unlock();

                        // update model, and recurse
                        agent.modelUpdate(a);
                        SearchNode n = agent.spc.findOrCreateNode(agent.hash(), true);
                        reward = n.sample(agent, rng, dfr + 1);
                        agent.modelRevert(undo);
                    }
                }
            }

            { // update our statistics for this node
                lock (m_mutex)
                {
                    double vc = (double)(m_visits);
                    m_mean = (m_mean * vc + reward) / (vc + 1.0);
                    m_visits++;
                }
            }

            return(reward);
        }
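The sketch below shows one plausible shape for the playout() helper called above, built only from methods that appear in these examples (selectRandomAction, modelUpdate, genPerceptAndUpdate, rewardFromPercept); the exact signature and behaviour of the real playout() are assumptions.

        // Sketch of a random playout over the remaining horizon. The caller is
        // expected to revert the model afterwards, as sample() does via ModelUndo.
        public double playout(Agent agent, Random rng, int playout_len)
        {
            double reward = 0.0;
            for (int i = 0; i < playout_len; i++)
            {
                // act uniformly at random for the rest of the horizon
                UInt64 a = agent.selectRandomAction(rng);
                agent.modelUpdate(a);

                // sample a percept under the model and accumulate its reward
                symbol_list_t percept = new symbol_list_t(0);
                agent.genPerceptAndUpdate(rng, percept);
                reward += agent.rewardFromPercept(percept);
            }
            return(reward);
        }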
Example #4
        public action_t naiveMonteCarlo(Agent agent)
        {
            DateTime ti      = DateTime.Now;
            TimeSpan elapsed = TimeSpan.Zero; // no time elapsed yet

            // determine how long (in milliseconds) to search this cycle
            double time_limit_ms = double.Parse((string)agent.options["cycle-length-ms"]);
            double time_limit    = time_limit_ms / 1000.0; // same limit in seconds (not used below)

            // sufficient statistics to compute the sample mean for each action
            // std::vector<std::pair<reward_t, double> > r(agent.numActions());
            double [] rfirst  = new double [agent.numActions()];
            double [] rsecond = new double[agent.numActions()];

            for (int i = 0; i < (int)agent.numActions(); i++)
            {
                rfirst[i] = rsecond[i] = 0.0;
            }

            ModelUndo mu             = new ModelUndo(agent);
            UInt64    total_samples  = 0;
            UInt64    start_hist_len = agent.historySize();

            do            // we ensure each action always has one estimate
            {
                for (UInt64 i = 0; i < agent.numActions(); i++)
                {
                    // make action
                    agent.modelUpdate(i);

                    // grab percept and determine immediate reward
                    symbol_list_t percept = new symbol_list_t(0);            //agent.preceptBits ());
                    agent.genPerceptAndUpdate(agent.rng, percept);
                    double reward = agent.rewardFromPercept(percept);

                    // playout the remainder of the sequence
                    reward += playout(agent, agent.rng, agent.horizon() - 1);

                    rfirst[i]  += reward;
                    rsecond[i] += 1.0;

                    agent.modelRevert(mu);
                    //assert(start_hist_len == agent.historySize());

                    total_samples++;
                }
                elapsed = DateTime.Now - ti;
            } while (Math.Abs(elapsed.TotalMilliseconds) < time_limit_ms);

            // determine best arm, breaking ties arbitrarily
            double   best        = double.NegativeInfinity;
            action_t best_action = 0;

            for (int i = 0; i < (int)agent.numActions(); i++)
            {
                // assert(r[i].second > 0.0);
                double noise = agent.rng.NextDouble() * 0.0001;

                double x = rfirst[i] / rsecond[i] + noise;

                if (x > best)
                {
                    best        = x;
                    best_action = (UInt64)i;
                }
            }

            //agent.logbox.Items.Add( "naive monte-carlo decision based on " + total_samples + " samples.");

            for (int i = 0; i < (int)agent.numActions(); i++)
            {
                //agent.logbox.Items.Add("action " + i + ": " +( rfirst[i] / rsecond[i]));
            }
            //agent.logbox.Items.Add(" best_action:" + best_action);
            return(best_action);
        }
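The rfirst/rsecond arrays mirror the commented-out C++ std::vector<std::pair<reward_t, double>>. Purely as an illustration, the same per-action sufficient statistics can be held in one small value type; this struct is not part of the original code.

        // Hypothetical alternative to the parallel rfirst/rsecond arrays:
        // one entry per action holding the running reward sum and sample count.
        public struct ActionStat
        {
            public double RewardSum;   // plays the role of rfirst[i]
            public double Samples;     // plays the role of rsecond[i]

            public double Mean
            {
                get { return Samples > 0.0 ? RewardSum / Samples : 0.0; }
            }
        }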
Example #5
        // revert the agent's internal model of the world
        // to that of a previous time cycle; returns false on failure
        public bool modelRevert(ModelUndo mu)
        {
            // assert(m_ct->historySize() > mu.historySize());
            // assert(!m_use_self_model || m_self_model->historySize() > mu.historySize());

            if (m_time_cycle < mu.age())
            {
                return(false);
            }

            // agent properties must be reverted before the context tree is
            // rolled back, since predicates that depend on the context may
            // also depend on these properties
            m_time_cycle          = mu.age();
            m_hash                = mu.hash();
            m_total_reward        = mu.reward();
            m_last_update_percept = mu.lastUpdatePercept();

            // revert the context tree and history back to its previous state

            if (mu.lastUpdatePercept())
            { // if we are undoing an action
                m_ct.revertHistory(mu.historySize());
                if (m_use_self_model)
                {
                    Int64 end_size = (Int64)m_self_model.historySize();
                    for (Int64 i = 0; i < (Int64)end_size - (Int64)mu.historySize(); i++)
                    {
                        m_self_model.revert();
                    }
                }
            }
            else
            {
                // if we are undoing an observation / reward
                Int64 end_size     = (Int64)m_ct.historySize();
                Int64 percept_bits = (Int64)(m_obs_bits + m_rew_bits);
                Int64 lim          = (Int64)end_size - (Int64)mu.historySize();
                for (Int64 i = 0; i < (Int64)end_size - (Int64)mu.historySize(); i++)
                {
                    //ORIGINAL ::  m_ct.revert(percept_bits - i - 1);
                    Int64 offset = percept_bits - i - 1;
                    m_ct.revert();
                    for (Int64 ix = 0; ix < (Int64)m_ct.size(); ix++)
                    {
                        if (ix != offset)
                        {
                            m_ct.m_history.pop_back();
                        }
                    }
                }
                if (m_use_self_model)
                {
                    m_self_model.revertHistory(mu.historySize());
                }
            }

            //assert(!m_use_self_model || m_self_model.historySize() == m_ct.historySize());

            return(true);
        }
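modelRevert() reads only five things from the snapshot: age(), hash(), reward(), historySize() and lastUpdatePercept(). Below is a minimal sketch of a ModelUndo that captures them; the Agent field names it copies appear in the example above, but their accessibility from this class and their exact types are assumptions.

        // Minimal ModelUndo sketch: records the agent state that modelRevert()
        // restores. Whether these Agent members are reachable from here, and
        // their precise types, are assumptions of this sketch.
        public class ModelUndo
        {
            private readonly UInt64 m_age;
            private readonly UInt64 m_hash;
            private readonly double m_reward;
            private readonly UInt64 m_history_size;
            private readonly bool   m_last_update_percept;

            public ModelUndo(Agent agent)
            {
                m_age                 = agent.m_time_cycle;
                m_hash                = agent.m_hash;
                m_reward              = agent.m_total_reward;
                m_history_size        = agent.historySize();
                m_last_update_percept = agent.m_last_update_percept;
            }

            public UInt64 age()               { return m_age; }
            public UInt64 hash()              { return m_hash; }
            public double reward()            { return m_reward; }
            public UInt64 historySize()       { return m_history_size; }
            public bool   lastUpdatePercept() { return m_last_update_percept; }
        }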