public void modelUpdate(action_t action)
{
    if (!isActionOk(action)) return;      // should be an assert
    if (!m_last_update_percept) return;   // an action must follow a percept update; should be an assert

    // update the internal model with the encoded action
    symbol_list_t action_syms = new symbol_list_t(0); //(UInt64)m_actions_bits);
    encodeAction(action_syms, action);
    m_ct.update(action_syms);
    m_ct.updateHistory(action_syms);
    if (m_use_self_model)
    {
        m_self_model.update(action_syms);
    }

    m_hash = hashAfterSymbols(action_syms);
    m_time_cycle++;
    m_last_update_percept = false;
}
public BuildModeGenerationGUI()
{
    InitializeComponent();
    Action = action_t.NONE;
    projectConfigs = new Hashtable();
    ResoreSettings();
}
/* selects the best action determined by the MCTS statistics */
public action_t selectBestMCTSAction(Agent agent)
{
    ModelUndo mu = new ModelUndo(agent);
    action_t best_action = agent.selectRandomAction(agent.rng);
    double best_exp = double.NegativeInfinity;
    bool found = false;

    for (UInt64 a = 0; a < agent.numActions(); a++)
    {
        SearchNode n = agent.spc.findNode(agent.hashAfterAction(a));
        if (n != null)
        {
            // a little noise breaks ties between equally valued actions
            double noise = agent.rng.NextDouble() * 0.0001;
            double exp = n.expectation() + noise;
            if (exp > best_exp)
            {
                best_exp = exp;   // keep the noisy value so later comparisons are consistent
                best_action = a;
                found = true;
            }
            //agent.logbox.Items.Add("action " + a + ": " + exp + " visits " + n.visits() +
            //    " self-predicted probability: " + agent.getPredictedActionProb(a));
        }
    }

    //agent.logbox.Items.Add("selectBestMCTSAction=" + best_action + " found=" + found);
    return best_action;
}
// probability of selecting an action according to the
// agent's internal model of its own behaviour
public double getPredictedActionProb(action_t action)
{
    // actions are equally likely if no internal model is used
    if (!m_use_self_model)
    {
        return 1.0 / (double)m_actions;
    }

    // compute the normalisation term, since some actions may be illegal
    double tot = 0.0;
    symbol_list_t symlist = new symbol_list_t(0); //(UInt64)m_actions_bits);
    for (UInt64 a = 0; a < m_actions; a++)
    {
        encodeAction(symlist, a);
        tot += m_self_model.predict(symlist);
    }
    //assert(tot != 0.0);

    encodeAction(symlist, action);
    return m_self_model.predict(symlist) / tot;
}
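A minimal, self-contained sketch of the normalisation step above, with made-up scores standing in for m_self_model.predict over each encoded action; illegal actions can score zero, so the raw scores only become probabilities after dividing by the total.

using System;

class SelfModelNormalisationSketch
{
    static void Main()
    {
        // stand-ins for m_self_model.predict(symlist) over four actions
        double[] raw = { 0.2, 0.1, 0.0, 0.3 };

        double tot = 0.0;
        foreach (double p in raw) tot += p;   // tot == 0.6

        // probability of action 3 under the self-model
        Console.WriteLine(raw[3] / tot);      // prints 0.5
    }
}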
/* hash of the history if we were to make a particular action */
public hash_t hashAfterAction(action_t action)
{
    //assert(isActionOk(action));
    symbol_list_t action_syms = new symbol_list_t(0); //(UInt64)m_actions_bits);
    encodeAction(action_syms, action);
    return hashAfterSymbols(action_syms);
}
// dispatch to the planner named by the "controller" option;
// fall back to a random action for unknown controllers
public action_t search(Agent agent)
{
    action_t best;
    string controller = (string)agent.options["controller"];
    if (controller == "mc")
    {
        best = naiveMonteCarlo(agent);
    }
    else if (controller == "mcts")
    {
        best = mcts(agent);
    }
    else
    {
        best = agent.selectRandomAction(agent.rng);
    }
    return best;
}
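A hypothetical wiring fragment for the dispatcher above; only the option keys ("controller" and "cycle-length-ms") are taken from this file, the surrounding context is assumed.

// hypothetical usage fragment, not a complete program
agent.options["controller"] = "mc";         // "mc", "mcts", or anything else for random
agent.options["cycle-length-ms"] = "100";   // per-decision sampling budget, parsed by naiveMonteCarlo()
action_t chosen = search(agent);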
// determine the next action to play: select the child with the highest
// UCB priority, treating unvisited children as maximally urgent
public action_t selectAction(Agent agent, Random rng)
{
    // higher values encourage more exploration, less exploitation
    double ExploreBias = agent.horizon() * agent.maxReward();
    double UnexploredBias = 1000000000.0;
    //assert(!m_chance_node);

    action_t best_action = 0;
    double best_priority = double.NegativeInfinity;

    for (UInt64 a = 0; a < agent.numActions(); a++)
    {
        SearchNode n = agent.spc.findNode(agent.hashAfterAction(a));
        //assert(n == null || n.m_chance_node);

        // a little noise breaks ties between equally urgent actions
        double priority, noise = rng.NextDouble() * 0.0001;

        if (n == null || n.visits() == 0)
        {
            // unvisited children are tried before any UCB comparison
            priority = UnexploredBias + noise;
        }
        else
        {
            // UCB formula: empirical mean plus an exploration bonus
            double pvisits = (double)visits();
            double cvisits = (double)n.visits();
            double bias = ExploreBias * Math.Sqrt(2.0 * Math.Log(pvisits) / cvisits);
            priority = n.expectation() + bias + noise;
        }

        if (priority > best_priority)
        {
            best_action = a;
            best_priority = priority;
        }
    }

    return best_action;
}
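To make the priority formula concrete, here is a self-contained sketch with toy numbers; the constant c stands in for ExploreBias and nothing here is tied to the Agent class. Given two children with the same empirical mean, the less-visited child receives the larger bonus and is explored next.

using System;

class UcbPrioritySketch
{
    // UCB1 priority: mean estimate plus an exploration bonus that shrinks
    // as the child is visited more often relative to its parent
    static double Priority(double expectation, double parentVisits,
                           double childVisits, double c)
    {
        return expectation + c * Math.Sqrt(2.0 * Math.Log(parentVisits) / childVisits);
    }

    static void Main()
    {
        Console.WriteLine(Priority(0.5, 100.0, 10.0, 1.0));  // ~1.46
        Console.WriteLine(Priority(0.5, 100.0, 50.0, 1.0));  // ~0.93
    }
}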
/* convert a list of symbols to an action; returns false on failure.
   note: the parameter is 'out' so the decoded action reaches the caller
   (the original passed action_t by value, which discarded the result) */
bool symsToAction(symbol_list_t symlist, out action_t action)
{
    action = 0;
    UInt16 c = 0;
    foreach (bool bit in symlist.reverseIterator())
    {
        if (bit)
        {
            action |= (UInt64)1 << c;
        }
        c++;
    }
    return isActionOk(action);
}
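A self-contained round-trip sketch of the bit convention implied by symsToAction: the list is read in reverse, so the last symbol is bit 0, which means the encoder must emit the most significant bit first. List&lt;bool&gt; stands in for symbol_list_t, and the MSB-first encoder is an assumption about what encode() does.

using System;
using System.Collections.Generic;

class ActionBitsSketch
{
    // assumed encoder: most significant bit first, so that reverse
    // iteration (as in symsToAction) recovers bit 0 from the last element
    static List<bool> Encode(ulong value, int bits)
    {
        var syms = new List<bool>();
        for (int i = bits - 1; i >= 0; i--)
            syms.Add(((value >> i) & 1UL) == 1UL);
        return syms;
    }

    // mirrors the decoding loop in symsToAction above
    static ulong Decode(List<bool> syms)
    {
        ulong value = 0;
        int c = 0;
        for (int i = syms.Count - 1; i >= 0; i--, c++)
            if (syms[i]) value |= 1UL << c;
        return value;
    }

    static void Main()
    {
        Console.WriteLine(Decode(Encode(5UL, 3)));  // prints 5
    }
}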
private void cancel_Click(object sender, System.EventArgs e)
{
    Action = action_t.NONE;
    SaveSettings();
    this.Close();
}
public action_t naiveMonteCarlo(Agent agent)
{
    DateTime ti = DateTime.Now;
    TimeSpan elapsed = TimeSpan.Zero;

    // number of milliseconds to spend sampling this decision
    double time_limit_ms = double.Parse((string)agent.options["cycle-length-ms"]);

    // sufficient statistics for the sample mean of each action:
    // rfirst[i] accumulates returns, rsecond[i] counts samples
    double[] rfirst = new double[agent.numActions()];
    double[] rsecond = new double[agent.numActions()];
    for (int i = 0; i < (int)agent.numActions(); i++)
    {
        rfirst[i] = rsecond[i] = 0.0;
    }

    ModelUndo mu = new ModelUndo(agent);
    UInt64 total_samples = 0;
    UInt64 start_hist_len = agent.historySize();

    do  // the do/while ensures each action always has at least one estimate
    {
        for (UInt64 i = 0; i < agent.numActions(); i++)
        {
            // make the action
            agent.modelUpdate(i);

            // grab a percept and determine the immediate reward
            symbol_list_t percept = new symbol_list_t(0); //agent.perceptBits());
            agent.genPerceptAndUpdate(agent.rng, percept);
            double reward = agent.rewardFromPercept(percept);

            // play out the remainder of the sequence
            reward += playout(agent, agent.rng, agent.horizon() - 1);

            rfirst[i] += reward;
            rsecond[i] += 1.0;

            agent.modelRevert(mu);
            //assert(start_hist_len == agent.historySize());
            total_samples++;
        }
        elapsed = DateTime.Now - ti;
    } while (elapsed.TotalMilliseconds < time_limit_ms);

    // determine the best arm, breaking ties with a little noise
    double best = double.NegativeInfinity;
    action_t best_action = 0;
    for (int i = 0; i < (int)agent.numActions(); i++)
    {
        //assert(rsecond[i] > 0.0);
        double noise = agent.rng.NextDouble() * 0.0001;
        double x = rfirst[i] / rsecond[i] + noise;
        if (x > best)
        {
            best = x;
            best_action = (UInt64)i;
        }
    }

    //agent.logbox.Items.Add("naive monte-carlo decision based on " + total_samples + " samples.");
    //for (int i = 0; i < (int)agent.numActions(); i++)
    //    agent.logbox.Items.Add("action " + i + ": " + (rfirst[i] / rsecond[i]));
    //agent.logbox.Items.Add("best_action: " + best_action);
    return best_action;
}
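A self-contained sketch of the (sum, count) bookkeeping above, with made-up returns: rfirst[i] / rsecond[i] is the sample mean that the final arm-selection loop compares.

using System;

class SampleMeanSketch
{
    static void Main()
    {
        double[] rfirst = new double[2];    // summed returns per action
        double[] rsecond = new double[2];   // sample counts per action
        double[][] sampled = {
            new[] { 1.0, 0.0, 1.0 },        // made-up returns for action 0
            new[] { 0.0, 0.0, 1.0 }         // made-up returns for action 1
        };

        for (int i = 0; i < 2; i++)
            foreach (double r in sampled[i]) { rfirst[i] += r; rsecond[i] += 1.0; }

        for (int i = 0; i < 2; i++)
            Console.WriteLine("action " + i + ": " + rfirst[i] / rsecond[i]);
        // action 0: 0.666..., action 1: 0.333... -> action 0 wins
    }
}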
// action sanity check
public bool isActionOk(action_t action)
{
    return action < m_actions;
}
// encoding/decoding actions and percepts to/from symbol lists
void encodeAction(symbol_list_t symlist, action_t action)
{
    symlist.clear();
    encode(symlist, (int)action, m_actions_bits);
}
private void enableCoverageClicked(object sender, System.EventArgs e)
{
    Action = action_t.ENABLE_COVERAGE_CONFIG;
    SaveSettings();
    this.Close();
}
private void disableCoverageClicked(object sender, EventArgs e)
{
    Action = action_t.DISABLE_COVERAGE_CONFIG;
    this.Close();
}
public UInt64 m_reward;   // the current reward

// receives the agent's action and calculates the new environment percept;
// concrete environments override this in a subclass
public virtual void performAction(action_t action)
{
    m_last_action = action;
}
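A minimal hypothetical subclass showing the intended override. The biased-coin environment, the Environment base-class name, and the m_observation field are all assumptions made for illustration; only m_reward and m_last_action appear in the code above, and action_t is assumed to alias UInt64.

using System;
using action_t = System.UInt64;   // assumed alias

public class BiasedCoinEnvironment : Environment
{
    private readonly Random m_rng = new Random();
    public UInt64 m_observation;   // assumed percept field

    // heads comes up with probability 0.7; the agent is rewarded
    // for guessing the outcome correctly
    public override void performAction(action_t action)
    {
        m_last_action = action;
        UInt64 coin = (m_rng.NextDouble() < 0.7) ? 1UL : 0UL;
        m_observation = coin;
        m_reward = (action == coin) ? 1UL : 0UL;
    }
}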