// updates the history statistics, without touching the context tree
public void updateHistory(symbol_list_t symlist)
{
    for (UInt64 i = 0; i < symlist.size(); i++)
    {
        m_history.push_back(symlist.bits[(int)i]);
    }
}
/* gives the estimated probability of observing a particular sequence */
public double predict(symbol_list_t symlist)
{
    // if we haven't seen enough context to make an informed
    // prediction then guess uniformly randomly
    if (m_history.size() + symlist.size() <= m_depth)
    {
        double exp = -(double)symlist.size();
        return(Math.Pow(2.0, exp));
    }

    // prob(sym1 ^ sym2 ^ ... | history) = prob(sym1 ^ sym2 ^ ... and history) / prob(history)
    double log_prob_history = logBlockProbability();
    update(symlist);
    double log_prob_syms_and_history = logBlockProbability();

    // undo the temporary updates so the tree is left unchanged
    for (int i = 0; i < symlist.bits.Count; i++)
    {
        revert();
    }

    return(Math.Exp(log_prob_syms_and_history - log_prob_history));
}
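/* Usage sketch (not part of the original code): estimating the probability that
 * the next symbol is 1 given the history recorded so far.  "ct" is an assumed
 * instance of this context-tree class with some history already loaded.
 *
 *     symbol_list_t next_bit = new symbol_list_t(0);
 *     next_bit.push_back(true);
 *     double p_one = ct.predict(next_bit);   // in [0, 1]; 0.5 if the history is too short
 */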
// encodes a percept onto a cleared symbol list: observation bits first,
// then reward bits, each field least-significant bit first
public void encodePercept(symbol_list_t symlist, UInt64 observation, UInt64 reward)
{
    symlist.clear();
    encode(symlist, (int)observation, m_obs_bits);
    encode(symlist, (int)reward, m_rew_bits);
}
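/* Layout sketch (not part of the original code), using illustrative sizes
 * m_obs_bits = 4 and m_rew_bits = 2:
 *
 *     encodePercept(symlist, observation: 5, reward: 2) yields the bits
 *     [1,0,1,0, 0,1]   // observation 5 = 0101b, LSB first, then reward 2 = 10b, LSB first
 *
 * decodeReward() and rewardFromPercept() therefore read the reward from the
 * tail of the list.
 */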
// probability of selecting an action according to the
// agent's internal model of its own behaviour
public double getPredictedActionProb(action_t action)
{
    // actions are equally likely if no internal model is used
    if (!m_use_self_model)
    {
        return(1.0 / (double)m_actions);
    }

    // compute the normalisation term, since some actions may be illegal
    double tot = 0.0;
    symbol_list_t symlist = new symbol_list_t(0); //(UInt64)m_actions_bits);
    for (UInt64 a = 0; a < m_actions; a++)
    {
        encodeAction(symlist, a);
        tot += m_self_model.predict(symlist);
    }
    //assert(tot != 0.0);

    encodeAction(symlist, action);
    return(m_self_model.predict(symlist) / tot);
}
/* generate a percept distributed according to our history statistics, and
 * update our internal agent state. this is more efficient than calling
 * genPercept and modelUpdate separately. */
public void genPerceptAndUpdate(Random rng, symbol_list_t percept)
{
    m_ct.genRandomSymbolsAndUpdate(rng, percept, (UInt64)(m_obs_bits + m_rew_bits));
    nonCTModelUpdate(percept);
}
// update the internal agent's model of the world
// due to performing an action
public void modelUpdate(action_t action)
{
    if (!isActionOk(action))
    {
        return; // should be an assert
    }
    if (!m_last_update_percept)
    {
        return; // should be an assert: an action must follow a percept update
    }

    // Update internal model
    symbol_list_t action_syms = new symbol_list_t(0); //(UInt64)m_actions_bits);
    encodeAction(action_syms, action);
    m_ct.update(action_syms);
    m_ct.updateHistory(action_syms);
    if (m_use_self_model)
    {
        m_self_model.update(action_syms);
    }

    m_hash = hashAfterSymbols(action_syms);
    m_time_cycle++;
    m_last_update_percept = false;
}
// Encodes a value onto the end of a symbol list using "bits" symbols,
// least-significant bit first
public void encode(symbol_list_t symlist, int value, int bits)
{
    for (int i = 0; i < bits; i++, value /= 2)
    {
        bool sym = ((value & 1) != 0);
        symlist.push_back(sym);
    }
}
/* hash of history if we were to make a particular action */
public hash_t hashAfterAction(action_t action)
{
    //assert(isActionOk(action));
    symbol_list_t action_syms = new symbol_list_t(0); //(UInt64)m_actions_bits);
    encodeAction(action_syms, action);
    return(hashAfterSymbols(action_syms));
}
/* updates the context tree with a list of symbols, one symbol at a time */
public void update(symbol_list_t symlist)
{
    for (int i = 0; i < symlist.bits.Count; i++)
    {
        update(symlist.bits[i]);
    }
}
/* update the non-context-tree part of the internal agent state after receiving a percept */
public void nonCTModelUpdate(symbol_list_t percept)
{
    if (m_use_self_model)
    {
        m_self_model.updateHistory(percept);
    }
    m_hash = hashAfterSymbols(percept);
    m_total_reward += rewardFromPercept(percept);
    m_last_update_percept = true;
}
// generate a specified number of random symbols
// distributed according to the context tree statistics
public void genRandomSymbols(Random rng, symbol_list_t symbols, UInt64 bits)
{
    genRandomSymbolsAndUpdate(rng, symbols, bits);

    // restore the context tree to its original state
    for (UInt64 i = 0; i < bits; i++)
    {
        revert();
    }
}
// generate a specified number of random symbols distributed according to
// the context tree statistics and update the context tree with the newly
// generated bits
public void genRandomSymbolsAndUpdate(Random rng, symbol_list_t symbols, UInt64 bits)
{
    symbols.clear();
    for (UInt64 i = 0; i < bits; i++)
    {
        // flip a biased coin for each bit: predict(false) is the model's
        // probability that the next symbol is 0
        double prediction = predict(false);
        bool rand_sym = (rng.NextDouble() < prediction) ? false : true;
        symbols.push_back(rand_sym);
        update(rand_sym); // TODO: optimise this loop
    }
}
// generate an action distributed according
// to our history statistics
public action_t genAction(Random rng)
{
    symbol_list_t syms = new symbol_list_t(0); //(UInt64)m_actions_bits);
    UInt64 action = 0;

    // use rejection sampling to pick an action according
    // to our historical distribution
    do
    {
        m_self_model.genRandomSymbols(rng, syms, (UInt64)m_actions_bits);
    } while (!symsToAction(syms, ref action));

    return(action);
}
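/* Illustrative note (not part of the original code), with hypothetical numbers:
 * if m_actions = 3 and m_actions_bits = 2, the self model may generate the bit
 * pattern [true, true], which decodes to action 3.  isActionOk() rejects it, so
 * symsToAction() returns false and the loop simply draws another sample.
 *
 *     // sampled bits   decoded action   accepted?
 *     // [false,false]  0                yes
 *     // [true, true]   3                no -> resample
 */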
/* computes the resultant history hash after processing a set of symbols */
hash_t hashAfterSymbols(symbol_list_t new_syms)
{
    hash_t rval = m_hash;

    // update the hash of the history, one symbol at a time
    for (int i = 0; i < new_syms.bits.Count; i++)
    {
        rval = hashAfterSymbol(new_syms.bits[i], rval);
    }

    return(rval);
}
// update the internal agent's model of the world
// due to receiving a percept
public void modelUpdate(UInt64 observation, UInt64 reward)
{
    // Update internal model
    symbol_list_t percept = new symbol_list_t(0); //(UInt64)m_obs_bits + (UInt64)m_rew_bits);
    encodePercept(percept, observation, reward);
    m_ct.update(percept);
    m_ct.updateHistory(percept);

    // Update the non-context-tree parts of the agent (hash, total reward, flags)
    nonCTModelUpdate(percept);
}
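/* Usage sketch (not part of the original code): the two modelUpdate overloads
 * are intended to alternate.  modelUpdate(action) requires m_last_update_percept
 * to be true and resets it to false, while this percept overload ends by calling
 * nonCTModelUpdate(), which sets it back to true.
 *
 *     // one agent-environment cycle (hypothetical driver code)
 *     agent.modelUpdate(action);                // action phase
 *     agent.modelUpdate(observation, reward);   // percept phase
 */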
/* fills "context" with the (at most m_depth) most recent history symbols, most recent first */
void getContext(symbol_list_t context)
{
    context.clear();

    // walk backwards through the history buffer, newest symbol first
    int ri = m_history.mem.Count - 1;
    for (UInt64 c = 0; ri >= 0 && c < m_depth; --ri, c++)
    {
        context.push_back((bool)m_history.mem[(int)ri]);
    }
}
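/* Illustrative note (not part of the original code): with a history ending
 * ...,0,1,1 (oldest to newest) and m_depth = 3, getContext() produces
 * [1, 1, 0] -- most recent symbol first -- so context.ibits(0) selects the
 * root's child corresponding to the most recently observed symbol when
 * update() and revert() walk down the tree. */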
/* convert a list of symbols to an action, false on failure */
bool symsToAction(symbol_list_t symlist, ref action_t action)
{
    action = 0;

    // the last symbol in the list is treated as the least significant bit
    UInt16 c = 0;
    foreach (bool bit in symlist.reverseIterator())
    {
        if (bit)
        {
            action = action | ((UInt64)1 << c);
        }
        c++;
    }

    return(isActionOk(action));
}
/* create (if necessary) all of the nodes in the current context */
void createNodesInCurrentContext(symbol_list_t context)
{
    CTNode ctn = m_root;
    for (UInt64 i = 0; i < context.size(); i++)
    {
        // scan the context and create new nodes as we go along,
        // inserting them into the tree as necessary
        int lp = context.ibits((int)i);
        CTNode nxt_ctn = ctn.m_child[lp];
        if (nxt_ctn == null)
        {
            //assert(p != null); // TODO: make more robust
            CTNode p = new CTNode();
            ctn.m_child[lp] = p;
            nxt_ctn = p;
        }
        ctn = nxt_ctn;
    }
}
/* simulate a path through a hypothetical future for the agent
 * within its internal model of the world, returning the
 * accumulated reward. */
double playout(Agent agent, Random rng, int playout_len)
{
    double start_reward = agent.reward();

    Stack undos = new Stack();

    for (int i = 0; i < playout_len; i++)
    {
        undos.Push(new ModelUndo(agent));

        // generate an action
        UInt64 a = agent.useSelfModel() ? agent.genAction(rng) : agent.selectRandomAction(rng);
        agent.modelUpdate(a);

        // generate a percept
        symbol_list_t percept = new symbol_list_t(0); //agent.preceptBits());
        undos.Push(new ModelUndo(agent));
        agent.genPerceptAndUpdate(rng, percept);
    }

    double rval = agent.reward() - start_reward;

    // undo every model update made during the playout, most recent first
    while (undos.Count > 0)
    {
        agent.modelRevert((ModelUndo)undos.Pop());
    }

    return(rval);
}
/* interprets a list of symbols as a reward */
public reward_t rewardFromPercept(symbol_list_t percept)
{
    //assert(percept.size() == m_obs_bits + m_rew_bits);

    // the reward bits sit at the end of the percept, so walk backwards from the tail
    IEnumerator it = percept.reverseIterator().GetEnumerator();

    if (m_base2_reward_encoding)
    {
        // base2 reward encoding
        int r = 0;
        for (int c = 0; c < m_rew_bits; c++)
        {
            it.MoveNext();
            //assert(it != percept.rend());
            if ((bool)it.Current)
            {
                r |= (1 << c);
            }
        }
        return((double)r);
    }

    // otherwise, assume the reward is the number of on bits
    double reward = 0.0;
    for (int c = 0; c < m_rew_bits; c++)
    {
        it.MoveNext();
        //assert(it != percept.rend());
        if ((bool)it.Current)
        {
            reward += 1.0;
        }
    }
    return(reward);
}
// Decodes the value encoded on the end of a list of symbols
int decode(symbol_list_t symlist, int bits)
{
    //assert(bits <= symlist.size());

    // read the last "bits" symbols, most significant bit first, mirroring the
    // reverse-iterator decode of the original implementation
    int value = 0;
    for (int i = 0; i < bits; i++)
    {
        value = (symlist.bits[symlist.bits.Count - 1 - i] ? 1 : 0) + 2 * value;
    }

    return(value);
}
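/* Round-trip sketch (not part of the original code), with illustrative values:
 * encode() appends the least significant bit first, and decode() reads the tail
 * of the list back most significant bit first, so the two invert each other.
 *
 *     symbol_list_t sl = new symbol_list_t(0);
 *     encode(sl, 6, 3);        // sl.bits is now [false, true, true]  (6 = 110b, LSB first)
 *     int v = decode(sl, 3);   // reads true, true, false from the tail -> v == 6
 */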
// perform a sample run through this node and its children,
// returning the accumulated reward from this sample run
public double sample(Agent agent, Random rng, int dfr)
{
    // the horizon is measured in cycles; dfr counts action and percept steps,
    // so the episode ends once dfr reaches twice the horizon
    if (dfr == (int)agent.horizon() * 2)
    {
        return(0.0);
    }

    ModelUndo undo = new ModelUndo(agent);
    double reward = 0.0;

    if (m_chance_node)
    {
        // handle chance nodes: generate a hypothetical percept
        symbol_list_t percept = new symbol_list_t(0); //(UInt64)(agent.m_obs_bits + agent.m_rew_bits));
        agent.genPerceptAndUpdate(rng, percept);

        // extract the reward for this transition, then recurse into the
        // decision node reached by this percept
        reward = agent.rewardFromPercept(percept);
        SearchNode n = agent.spc.findOrCreateNode(agent.hash(), false);
        reward += n.sample(agent, rng, dfr + 1);
        agent.modelRevert(undo);
    }
    else
    {
        // handle decision nodes
        lock (m_mutex)
        {
            // decide whether to expand the tree further or fall back to a playout
            bool do_playout =
                visits() < MinVisitsBeforeExpansion ||
                dfr >= MaxDistanceFromRoot ||
                agent.spc.search_node_pool.Count >= (int)agent.spc.MaxSearchNodes;

            if (do_playout)
            {
                reward = playout(agent, rng, (int)agent.horizon() - dfr / 2);
            }
            else
            {
                // pick an action
                UInt64 a = selectAction(agent, rng);

                // update the model, and recurse into the resulting chance node
                agent.modelUpdate(a);
                SearchNode n = agent.spc.findOrCreateNode(agent.hash(), true);
                reward = n.sample(agent, rng, dfr + 1);
                agent.modelRevert(undo);
            }
        }
    }

    // update our statistics for this node: incremental update of the sample mean
    lock (m_mutex)
    {
        double vc = (double)m_visits;
        m_mean = (m_mean * vc + reward) / (vc + 1.0);
        m_visits++;
    }

    return(reward);
}
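/* Driver sketch (not part of the original code): a hypothetical caller could grow
 * the search tree by repeatedly sampling from the decision node that corresponds
 * to the agent's current history, within some assumed time budget "time_limit_ms".
 * This assumes, as in sample() above, that findOrCreateNode's second argument
 * marks a chance node, so the root decision node is created with false.
 *
 *     SearchNode root = agent.spc.findOrCreateNode(agent.hash(), false);
 *     DateTime start = DateTime.Now;
 *     while ((DateTime.Now - start).TotalMilliseconds < time_limit_ms)
 *     {
 *         root.sample(agent, agent.rng, 0);
 *     }
 *     // root.m_mean now estimates the expected future reward from this history
 */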
public action_t naiveMonteCarlo(Agent agent)
{
    DateTime ti = DateTime.Now;
    TimeSpan elapsed = TimeSpan.Zero;

    // determine the number of milliseconds available for this search cycle
    double time_limit_ms = double.Parse((string)agent.options["cycle-length-ms"]);

    // sufficient statistics to compute the sample mean for each action:
    // rfirst accumulates reward, rsecond counts samples
    double[] rfirst = new double[agent.numActions()];
    double[] rsecond = new double[agent.numActions()];
    for (int i = 0; i < (int)agent.numActions(); i++)
    {
        rfirst[i] = rsecond[i] = 0.0;
    }

    ModelUndo mu = new ModelUndo(agent);
    UInt64 total_samples = 0;
    UInt64 start_hist_len = agent.historySize();

    do // we ensure each action always has at least one estimate
    {
        for (UInt64 i = 0; i < agent.numActions(); i++)
        {
            // make the action
            agent.modelUpdate(i);

            // grab a percept and determine the immediate reward
            symbol_list_t percept = new symbol_list_t(0); //agent.preceptBits());
            agent.genPerceptAndUpdate(agent.rng, percept);
            double reward = agent.rewardFromPercept(percept);

            // playout the remainder of the sequence
            reward += playout(agent, agent.rng, (int)agent.horizon() - 1);

            rfirst[i] += reward;
            rsecond[i] += 1.0;

            agent.modelRevert(mu);
            //assert(start_hist_len == agent.historySize());

            total_samples++;
        }

        elapsed = DateTime.Now - ti;
    } while (elapsed.TotalMilliseconds < time_limit_ms);

    // determine the best arm, breaking ties arbitrarily
    double best = double.NegativeInfinity;
    action_t best_action = 0;
    for (int i = 0; i < (int)agent.numActions(); i++)
    {
        //assert(rsecond[i] > 0.0);
        double noise = agent.rng.NextDouble() * 0.0001;
        double x = rfirst[i] / rsecond[i] + noise;
        if (x > best)
        {
            best = x;
            best_action = (UInt64)i;
        }
    }

    //agent.logbox.Items.Add("naive monte-carlo decision based on " + total_samples + " samples.");
    //for (int i = 0; i < (int)agent.numActions(); i++)
    //{
    //    agent.logbox.Items.Add("action " + i + ": " + (rfirst[i] / rsecond[i]));
    //}
    //agent.logbox.Items.Add(" best_action:" + best_action);

    return(best_action);
}
// generate a percept distributed according
// to our history statistics
public void genPercept(Random rng, symbol_list_t symlist)
{
    m_ct.genRandomSymbols(rng, symlist, (UInt64)m_obs_bits + (UInt64)m_rew_bits);
}
// get the agent's probability of receiving a particular percept
public double perceptProbability(symbol_list_t percept)
{
    //assert(percept.size() == m_obs_bits + m_rew_bits);
    return(m_ct.predict(percept));
}
// encoding/decoding actions and percepts to/from symbol lists
void encodeAction(symbol_list_t symlist, action_t action)
{
    symlist.clear();
    encode(symlist, (int)action, m_actions_bits);
}
/* removes the most recently observed symbol from the context tree
 * and from the history buffer */
public void revert()
{
    if (m_history.size() == 0)
    {
        return;
    }

    // 1. remove the most recent symbol from the history buffer
    symbol_t sym = m_history.back();
    m_history.pop_back();

    // compute the current context
    symbol_list_t context = new symbol_list_t(0); //m_depth);
    getContext(context);

    // no need to undo a context tree update if there was
    // not enough context to begin with
    if (context.size() < m_depth)
    {
        return;
    }

    // 2. determine the path from the root to the leaf node for this context
    Stack path = new Stack();
    path.Push(m_root);

    CTNode ctn = m_root;
    for (UInt64 i = 0; i < context.size() && ctn != null; i++)
    {
        ctn = ctn.m_child[context.ibits((int)i)];
        path.Push(ctn);
    }

    // 3. update the probability estimates from the leaf node back up to the root,
    //    deleting any superfluous nodes as we go
    for (; path.Count != 0; path.Pop())
    {
        ctn = (CTNode)path.Peek();
        if (ctn == null)
        {
            break;
        }

        // undo the previous KT estimate update
        ctn.m_count[sym ? 1 : 0]--;
        ctn.m_log_prob_est -= ctn.logKTMul(sym);

        // reclaim memory for any child nodes that have now seen no data
        for (int child = 0; child < 2; child++)
        {
            if (ctn.m_child[child] != null && ctn.m_child[child].visits() == 0)
            {
                ctn.m_child[child] = null;
            }
        }

        // update the weighted probabilities
        if (path.Count == (int)m_depth + 1)
        {
            // leaf node: the weighted probability is just the KT estimate
            ctn.m_log_prob_weighted = ctn.logProbEstimated();
        }
        else
        {
            // computes P_w = log{0.5 * [P_kt + P_w0 * P_w1]} in log space
            double log_prob_on = ctn.child(true) != null ? ctn.child(true).logProbWeighted() : 0.0;
            double log_prob_off = ctn.child(false) != null ? ctn.child(false).logProbWeighted() : 0.0;
            double log_one_plus_exp = log_prob_off + log_prob_on - ctn.logProbEstimated();

            // NOTE: no need to compute log(1 + e^x) if x is large, which also avoids overflow
            if (log_one_plus_exp < 100.0)
            {
                log_one_plus_exp = Math.Log(1.0 + Math.Exp(log_one_plus_exp));
            }

            ctn.m_log_prob_weighted = log_point_five + ctn.logProbEstimated() + log_one_plus_exp;
        }
    }
}
/* interprets the tail of a symbol list as an action */
UInt64 decodeAction(symbol_list_t symlist)
{
    return((UInt64)decode(symlist, (int)m_actions_bits));
}
/* interprets the tail of a symbol list as a reward */
public UInt64 decodeReward(symbol_list_t symlist)
{
    return((UInt64)decode(symlist, (int)m_rew_bits));
}
/* updates the context tree with a single symbol */
void update(symbol_t sym)
{
    // compute the current context
    symbol_list_t context = new symbol_list_t(0); //m_depth);
    getContext(context);

    // if we have not seen enough context, append the symbol
    // to the history buffer and skip updating the context tree
    if (context.size() < m_depth)
    {
        m_history.push_back(sym);
        return;
    }

    // 1. create new nodes in the context tree (if necessary)
    createNodesInCurrentContext(context);

    // 2. walk down the tree to the relevant leaf context, saving the path as we go
    Stack path = new Stack();
    path.Push(m_root); // add the empty context

    CTNode ctn = m_root;
    for (UInt64 i = 0; i < context.size(); i++)
    {
        ctn = ctn.m_child[context.ibits((int)i)];
        path.Push(ctn);
    }

    // 3. update the probability estimates from the leaf node back up to the root
    for (; path.Count != 0; path.Pop())
    {
        CTNode n = (CTNode)path.Peek();

        // update the KT estimate and counts
        n.m_log_prob_est += n.logKTMul(sym);
        n.m_count[sym ? 1 : 0]++;

        // update the weighted probabilities
        if (path.Count == (int)m_depth + 1)
        {
            // leaf node: the weighted probability is just the KT estimate
            n.m_log_prob_weighted = n.logProbEstimated();
        }
        else
        {
            // computes P_w = log{0.5 * [P_kt + P_w0 * P_w1]} in log space
            double log_prob_on = n.child(true) != null ? n.child(true).logProbWeighted() : 0.0;
            double log_prob_off = n.child(false) != null ? n.child(false).logProbWeighted() : 0.0;
            double log_one_plus_exp = log_prob_off + log_prob_on - n.logProbEstimated();

            // NOTE: no need to compute log(1 + e^x) if x is large, which also avoids overflow
            if (log_one_plus_exp < 100.0)
            {
                log_one_plus_exp = Math.Log(1.0 + Math.Exp(log_one_plus_exp));
            }

            n.m_log_prob_weighted = log_point_five + n.logProbEstimated() + log_one_plus_exp;
        }
    }

    // 4. save the new symbol to the history buffer
    m_history.push_back(sym);
}
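/* For reference (not part of the original code): update() and revert() maintain,
 * in log space, the context tree weighting recursion that the
 * "P_w = log{0.5 * [P_kt + P_w0 * P_w1]}" comments above refer to.  For a node n
 * at depth d in a tree of depth D:
 *
 *     P_w^n = P_kt^n                                    if d = D (leaf)
 *     P_w^n = 0.5 * ( P_kt^n + P_w^{n0} * P_w^{n1} )    if d < D
 *
 * Taking logs gives
 *
 *     log P_w^n = log 0.5 + log P_kt^n
 *                 + log(1 + exp(log P_w^{n0} + log P_w^{n1} - log P_kt^n)),
 *
 * which is exactly the log_one_plus_exp computation performed above; when the
 * argument x of log(1 + e^x) is large the code uses x directly, since
 * log(1 + e^x) ~ x there, which also avoids overflow.
 */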