// worker task: repeatedly sample the global search tree until either the
// shared simulation budget or the per-cycle time limit is exhausted
public void doTask() {
    TimeSpan elapsed = DateTime.Now - m_timer;
    try {
        do {
            lock (m_stat_mutex) {
                // stop once the shared simulation budget has been spent
                if (m_samples >= m_num_simulations) {
                    break;
                }
                m_samples++;
            }
            m_agent.spc.gbl_search_root.sample(m_agent, m_rng, 0);
            elapsed = DateTime.Now - m_timer;
        } while (elapsed.TotalMilliseconds < m_time_limit);
    } catch (Exception e) {
        // swallow exceptions so a failed simulation does not kill the worker;
        // logging is currently disabled
        //m_agent.logbox.Items.Add(e.Message);
        //m_agent.logbox.Items.Add(e.StackTrace);
        //m_agent.logbox.Items.Add(e.InnerException);
    }
}
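// A minimal sketch of how doTask could be driven in parallel, assuming one
// worker object per hardware thread sharing m_stat_mutex and the sample
// budget. SearchWorker and runParallelSearch are hypothetical names used for
// illustration only, not part of this codebase.
public static void runParallelSearch(Agent agent, int num_threads) {
    var threads = new System.Collections.Generic.List<System.Threading.Thread>();
    for (int i = 0; i < num_threads; i++) {
        var worker = new SearchWorker(agent); // hypothetical: holds m_agent, m_rng, timer state
        var t = new System.Threading.Thread(worker.doTask);
        threads.Add(t);
        t.Start();
    }
    foreach (var t in threads) {
        t.Join(); // wait until the budget or the time limit is hit
    }
}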
/* selects the best action determined by the MCTS statistics */
public action_t selectBestMCTSAction(Agent agent) {
    action_t best_action = agent.selectRandomAction(agent.rng);
    double best_exp = double.NegativeInfinity;
    bool found = false;

    for (UInt64 a = 0; a < agent.numActions(); a++) {
        SearchNode n = agent.spc.findNode(agent.hashAfterAction(a));
        if (n != null) {
            // a little noise breaks ties between equally valued actions
            double noise = agent.rng.NextDouble() * 0.0001;
            double exp = n.expectation() + noise;
            if (exp > best_exp) {
                best_exp = exp; // compare later actions against the noisy value
                best_action = a;
                found = true;
            }
            //agent.logbox.Items.Add("action " + a + ": " + exp + " visits " + n.visits()
            //    + " self-predicted probability: " + agent.getPredictedActionProb(a));
        }
    }
    //agent.logbox.Items.Add("selectBestMCTSAction=" + best_action + " found=" + found);
    return best_action;
}
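// Hedged usage sketch: a search cycle would grow the tree with repeated calls
// to sample() and then commit to the greedy action. The `search` wrapper and
// the fixed budget of 500 simulations are illustrative assumptions; the real
// driver above uses a time limit and a shared sample budget instead.
public action_t search(Agent agent) {
    for (int i = 0; i < 500; i++) {
        agent.spc.gbl_search_root.sample(agent, agent.rng, 0);
    }
    return selectBestMCTSAction(agent);
}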
// perform a sample run through this node and its children,
// returning the accumulated reward from this sample run
public double sample(Agent agent, Random rng, int dfr) {
    // the horizon is measured in interaction cycles; each cycle adds two
    // tree levels (a decision node and a chance node), hence horizon * 2
    if (dfr == (int)agent.horizon() * 2) {
        return 0.0;
    }

    ModelUndo undo = new ModelUndo(agent);
    double reward = 0.0;

    if (m_chance_node) {
        // chance node: generate a hypothetical percept from the model
        symbol_list_t percept = new symbol_list_t(0); // original reserved (UInt64)(agent.m_obs_bits + agent.m_rew_bits)
        agent.genPerceptAndUpdate(rng, percept);

        // extract the reward for this transition, then recurse
        reward = agent.rewardFromPercept(percept);
        SearchNode n = agent.spc.findOrCreateNode(agent.hash(), false);
        reward += n.sample(agent, rng, dfr + 1);
        agent.modelRevert(undo);
    } else {
        // decision node
        lock (m_mutex) {
            // fall back to a playout if this node is barely visited, too deep,
            // or the node pool is exhausted
            bool do_playout = visits() < MinVisitsBeforeExpansion ||
                              dfr >= MaxDistanceFromRoot ||
                              agent.spc.search_node_pool.Count >= (int)agent.spc.MaxSearchNodes;
            if (do_playout) {
                // dfr / 2 cycles have already elapsed; play out the remainder
                reward = playout(agent, rng, (int)agent.horizon() - (dfr / 2));
            } else {
                // pick an action with the tree policy, update the model, recurse
                UInt64 a = selectAction(agent, rng);
                agent.modelUpdate(a);
                SearchNode n = agent.spc.findOrCreateNode(agent.hash(), true);
                reward = n.sample(agent, rng, dfr + 1);
                agent.modelRevert(undo);
            }
        }
    }

    // update this node's visit count and running mean reward
    lock (m_mutex) {
        double vc = (double)m_visits;
        m_mean = (m_mean * vc + reward) / (vc + 1.0);
        m_visits++;
    }
    return reward;
}
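// selectAction is called above but not shown in this section. A minimal
// UCB1-style sketch of what such a tree policy typically looks like, assuming
// child nodes are found via hashAfterAction and untried actions are expanded
// first; selectActionUCB and ExploreConstant are hypothetical names, not
// necessarily the policy this codebase implements.
private UInt64 selectActionUCB(Agent agent, Random rng) {
    const double ExploreConstant = 2.0; // hypothetical exploration weight
    UInt64 best = 0;
    double best_score = double.NegativeInfinity;
    for (UInt64 a = 0; a < agent.numActions(); a++) {
        SearchNode n = agent.spc.findNode(agent.hashAfterAction(a));
        if (n == null || n.visits() == 0) {
            return a; // always try an unvisited action first
        }
        // exploit the running mean, explore rarely visited children
        double ucb = n.expectation() +
                     ExploreConstant * Math.Sqrt(Math.Log((double)visits()) / (double)n.visits());
        if (ucb > best_score) {
            best_score = ucb;
            best = a;
        }
    }
    return best;
}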
public action_t naiveMonteCarlo(Agent agent) {
    DateTime start = DateTime.Now;
    TimeSpan elapsed;

    // number of milliseconds to spend sampling this cycle
    double time_limit_ms = double.Parse((string)agent.options["cycle-length-ms"]);

    // sufficient statistics for the sample mean of each action:
    // rfirst[i] accumulates total reward, rsecond[i] counts samples
    // (the C++ original used std::vector<std::pair<reward_t, double>>)
    double[] rfirst = new double[agent.numActions()];
    double[] rsecond = new double[agent.numActions()];
    for (int i = 0; i < (int)agent.numActions(); i++) {
        rfirst[i] = rsecond[i] = 0.0;
    }

    ModelUndo mu = new ModelUndo(agent);
    UInt64 total_samples = 0;
    UInt64 start_hist_len = agent.historySize();

    do { // sampling every action each pass ensures each arm gets at least one estimate
        for (UInt64 i = 0; i < agent.numActions(); i++) {
            // make the action
            agent.modelUpdate(i);

            // grab a percept and determine the immediate reward
            symbol_list_t percept = new symbol_list_t(0);
            agent.genPerceptAndUpdate(agent.rng, percept);
            double reward = agent.rewardFromPercept(percept);

            // play out the remainder of the sequence, then restore the model
            reward += playout(agent, agent.rng, agent.horizon() - 1);
            rfirst[i] += reward;
            rsecond[i] += 1.0;
            agent.modelRevert(mu);
            System.Diagnostics.Debug.Assert(start_hist_len == agent.historySize());
            total_samples++;
        }
        elapsed = DateTime.Now - start;
    } while (elapsed.TotalMilliseconds < time_limit_ms);

    // determine the best arm, breaking ties with a little noise
    double best = double.NegativeInfinity;
    action_t best_action = 0;
    for (int i = 0; i < (int)agent.numActions(); i++) {
        double noise = agent.rng.NextDouble() * 0.0001;
        double x = rfirst[i] / rsecond[i] + noise;
        if (x > best) {
            best = x;
            best_action = (UInt64)i;
        }
    }
    //agent.logbox.Items.Add("naive monte-carlo decision based on " + total_samples + " samples.");
    //for (int i = 0; i < (int)agent.numActions(); i++) {
    //    agent.logbox.Items.Add("action " + i + ": " + (rfirst[i] / rsecond[i]));
    //}
    //agent.logbox.Items.Add("best_action: " + best_action);
    return best_action;
}
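// playout is called by both sample() and naiveMonteCarlo() but is not defined
// in this section. A hedged sketch of a uniform-random rollout under the
// agent's model, built only from calls that appear above; playoutSketch is a
// hypothetical name, and the caller is assumed to revert the model afterwards
// via ModelUndo, as naiveMonteCarlo does.
public double playoutSketch(Agent agent, Random rng, UInt64 horizon) {
    double total = 0.0;
    for (UInt64 step = 0; step < horizon; step++) {
        agent.modelUpdate(agent.selectRandomAction(rng)); // random action
        symbol_list_t percept = new symbol_list_t(0);
        agent.genPerceptAndUpdate(rng, percept);          // sample a percept
        total += agent.rewardFromPercept(percept);        // accumulate reward
    }
    return total;
}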
// revert the agent's internal model of the world
// to that of a previous time cycle; returns false on failure
public bool modelRevert(ModelUndo mu) {
    // assert(m_ct.historySize() > mu.historySize());
    // assert(!m_use_self_model || m_self_model.historySize() > mu.historySize());
    if (m_time_cycle < mu.age()) {
        return false;
    }

    // agent properties must be reverted before the context update, since
    // predicates that depend on the context may depend on them
    m_time_cycle = mu.age();
    m_hash = mu.hash();
    m_total_reward = mu.reward();
    m_last_update_percept = mu.lastUpdatePercept();

    // revert the context tree and history back to their previous state
    if (mu.lastUpdatePercept()) {
        // undoing an action: only the history needs rewinding
        m_ct.revertHistory(mu.historySize());
        if (m_use_self_model) {
            Int64 end_size = (Int64)m_self_model.historySize();
            for (Int64 i = 0; i < end_size - (Int64)mu.historySize(); i++) {
                m_self_model.revert();
            }
        }
    } else {
        // undoing an observation / reward
        Int64 end_size = (Int64)m_ct.historySize();
        Int64 percept_bits = (Int64)(m_obs_bits + m_rew_bits);
        Int64 lim = end_size - (Int64)mu.historySize();
        for (Int64 i = 0; i < lim; i++) {
            // ORIGINAL (C++): m_ct.revert(percept_bits - i - 1);
            // the offset-aware revert is emulated by reverting the tree and
            // then popping every history entry except the one at `offset`
            Int64 offset = percept_bits - i - 1;
            m_ct.revert();
            for (Int64 ix = 0; ix < (Int64)m_ct.size(); ix++) {
                if (ix != offset) {
                    m_ct.m_history.pop_back();
                }
            }
        }
        if (m_use_self_model) {
            m_self_model.revertHistory(mu.historySize());
        }
    }
    //assert(!m_use_self_model || m_self_model.historySize() == m_ct.historySize());
    return true;
}
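// ModelUndo is constructed throughout this file but defined elsewhere. A
// hedged sketch of the snapshot it would need for modelRevert to work as
// written: the accessor names follow the calls above, while the Agent getters
// (age, hash, reward, historySize, lastUpdatePercept) are assumptions about
// the rest of the codebase, and ModelUndoSketch is a hypothetical name.
public class ModelUndoSketch {
    private readonly UInt64 m_age, m_hash, m_history_size;
    private readonly double m_reward;
    private readonly bool m_last_update_percept;

    public ModelUndoSketch(Agent agent) {
        // snapshot exactly the state that modelRevert restores
        m_age = agent.age();                               // assumed accessor
        m_hash = agent.hash();                             // used by sample() above
        m_reward = agent.reward();                         // assumed accessor
        m_history_size = agent.historySize();              // used by naiveMonteCarlo()
        m_last_update_percept = agent.lastUpdatePercept(); // assumed accessor
    }

    public UInt64 age() { return m_age; }
    public UInt64 hash() { return m_hash; }
    public double reward() { return m_reward; }
    public UInt64 historySize() { return m_history_size; }
    public bool lastUpdatePercept() { return m_last_update_percept; }
}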