public double Sample(Agent agent, int horizon)
        {
            double reward = 0.0;

            if (horizon == 0) {
                // The search horizon has been reached: no further reward can accrue.
                return reward;
            }
            else if (this.Type == ChanceNode) {
                var percept = agent.GeneratePerceptAndUpdate();
                int observation = percept.Item1;
                int randomReward = percept.Item2;

                if (!this.Children.ContainsKey(observation)) {
                    // New observation at this node: add a decision-node child for it.
                    this.Children[observation] = new MonteCarloSearchNode(DecisionNode);
                }
                MonteCarloSearchNode observationChild = this.Children[observation];

                reward = randomReward + observationChild.Sample(agent, horizon - 1);
            }
            else if (this.Visits == 0)
            {
                // Unvisited decision node, or the maximum tree depth has been exceeded:
                // estimate the value of this state with a random playout.
                reward = agent.Playout(horizon);
            }
            else { // Previously visited decision node: descend via action selection.

                int action = this.SelectAction(agent);

                agent.ModelUpdateAction(action);

                if (!this.Children.ContainsKey(action)) {
                    // First time this action is taken here: add a chance-node child.
                    this.Children[action] = new MonteCarloSearchNode(ChanceNode);
                }
                MonteCarloSearchNode actionChild = this.Children[action];

                // The horizon is not decremented here: a cycle is only consumed when
                // the chance node generates the percept (this follows pyaixi).
                reward = actionChild.Sample(agent, horizon);
            }

            // Incrementally update the mean reward and the visit count.
            double visitsDouble = this.Visits;
            this.Mean = (reward + visitsDouble * this.Mean) / (visitsDouble + 1.0);
            this.Visits += 1;

            return reward;
        }
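
        // For context, a minimal sketch (added for illustration, not from the original
        // source) of how Sample is typically driven by the agent's Search(): run many
        // simulations from a fresh root node, undoing the model updates made during
        // each simulation. SaveModelState/RestoreModelState are assumed helpers here,
        // not confirmed members of this Agent class.
        public int SearchSketch(Agent agent, int simulations)
        {
            var root = new MonteCarloSearchNode(DecisionNode);
            for (int i = 0; i < simulations; i++)
            {
                var snapshot = agent.SaveModelState();    // hypothetical helper
                root.Sample(agent, agent.Horizon);
                agent.RestoreModelState(snapshot);        // hypothetical helper
            }

            // Choose the action whose chance-node child accumulated the best mean reward.
            int bestAction = -1;
            double bestMean = double.NegativeInfinity;
            foreach (var pair in root.Children)
            {
                if (pair.Value.Mean > bestMean)
                {
                    bestMean = pair.Value.Mean;
                    bestAction = pair.Key;
                }
            }
            return bestAction;
        }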
        // Interaction loop between the agent and the environment.
        // (In the other version this part is handled by BrainSimulator.)
        // Each cycle, the environment produces an observation and a reward and passes
        // them to the agent; the agent responds with an action, and the cycle repeats.
        public static void InteractionLoop(Agent agent, AIXIEnvironment env, Dictionary<string, string> options)
        {
            Random rnd;
            if (options.ContainsKey("random-seed"))
            {
                int seed;
                // Note: if the option cannot be parsed, TryParse leaves seed at 0.
                int.TryParse(options["random-seed"], out seed);
                rnd = new Random(seed);
            }
            else
            {
                rnd = new Random();
            }

            // Exploration: with some probability, take a random action instead of searching.
            // The probability decays exponentially: exploreRate * exploreDecay^cycle.
            var exploreRate = 0.0;
            if (options.ContainsKey("exploration"))
            {
                exploreRate = Utils.MyToDouble(options["exploration"]);
            }
            var explore = exploreRate > 0;

            var exploreDecay = 0.0;
            if (options.ContainsKey("explore-decay"))
            {
                exploreDecay = Utils.MyToDouble(options["explore-decay"]);
            }

            Debug.Assert(0.0 <= exploreRate);
            Debug.Assert(0.0 <= exploreDecay && exploreDecay <= 1.0);

            // Automatic halting after a certain number of cycles.
            var terminateAge = 0;
            if (options.ContainsKey("terminate-age"))
            {
                terminateAge = Convert.ToInt32(options["terminate-age"]);
            }
            var terminateCheck = terminateAge > 0;
            Debug.Assert(0 <= terminateAge);

            // Once the learning period has passed, the agent stops exploring and
            // simply exploits the model it has built.
            var learningPeriod = 0;
            if (options.ContainsKey("learning-period"))
            {
                learningPeriod = Convert.ToInt32(options["learning-period"]);
            }
            Debug.Assert(0 <= learningPeriod);

            var cycle = 0;
            while (!env.IsFinished)
            {

                if (terminateCheck && agent.Age > terminateAge)
                {
                    break;
                }
                var cycleStartTime = DateTime.Now;
                var observation = env.Observation;
                var reward = env.Reward;

                if (learningPeriod > 0 && cycle > learningPeriod)
                {
                    explore = false;
                }

                // Give the observation and reward to the agent.
                agent.ModelUpdatePercept(observation, reward);

                var explored = false;
                int action;

                if (explore && rnd.NextDouble() < exploreRate)
                {
                    explored = true;
                    action = agent.GenerateRandomAction();
                }
                else
                {
                    // Get the agent's planned response to the observation and reward.
                    action = agent.Search();
                }

                // Pass the agent's action to the environment and to the agent's own model.
                env.PerformAction(action);
                agent.ModelUpdateAction(action);

                var timeTaken = DateTime.Now - cycleStartTime;

                // Log per-cycle statistics.
                Console.WriteLine("{0}:\t{1},{2},{3}\t{4},{5}  \t{6},{7}\t>{8},{9}",
                    cycle, observation, reward, action,
                    explored, exploreRate,
                    agent.TotalReward, agent.AverageReward(),
                    timeTaken, agent.ModelSize()
                    );

                if (explore)
                {
                    // Decay the exploration rate.
                    exploreRate *= exploreDecay;
                }
                cycle += 1;
            }
        }
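
        // A hedged usage sketch (added for illustration): wiring an agent and an
        // environment together and running the loop. CoinFlip and the Agent
        // constructor signature are assumptions, not confirmed by this source;
        // the option keys match the ones read by InteractionLoop above.
        public static void RunExample()
        {
            var options = new Dictionary<string, string>
            {
                { "random-seed", "42" },
                { "exploration", "0.1" },
                { "explore-decay", "0.99" },
                { "terminate-age", "10000" },
                { "learning-period", "5000" }
            };
            AIXIEnvironment env = new CoinFlip(options);   // hypothetical environment
            var agent = new Agent(env, options);           // assumed constructor
            InteractionLoop(agent, env, options);
        }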
        public int SelectAction(Agent agent)
        {
            Debug.Assert(agent.MaximumReward() != null, "MaximumReward() must be defined when selecting an action");

            // Scale the UCB exploration term by the best total reward attainable
            // over the remaining search horizon.
            double exploreBias = (double)agent.Horizon * agent.MaximumReward().Value;
            double explorationNumerator = this.ExplorationConstant * Math.Log(this.Visits);
            int bestAction = -1;
            double bestPriority = double.NegativeInfinity;

            foreach (int action in agent.Environment.ValidActions) {
                MonteCarloSearchNode node = null;
                if (this.Children.ContainsKey(action)) {
                    node = this.Children[action];
                }

                double priority;
                if (node == null || node.Visits == 0) {
                    // Previously unexplored action: assign a fixed optimistic priority.
                    priority = this.UnexploredBias;
                }
                else {
                    // UCB: exploit a high mean reward, but keep exploring rarely visited children.
                    priority = node.Mean + exploreBias * Math.Sqrt(explorationNumerator / node.Visits);
                }

                // The small random jitter breaks ties between equal priorities.
                if (priority > (bestPriority + Utils.RandomDouble(0, 0.001))) {
                    bestAction = action;
                    bestPriority = priority;
                }
            }
            return bestAction;
        }
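
        // For reference: the priority computed in SelectAction is the UCT selection
        // rule scaled to this agent's reward range,
        //
        //     priority(a) = mean(a) + m * Rmax * sqrt( C * ln(N) / N(a) )
        //
        // where m = agent.Horizon, Rmax = agent.MaximumReward(), C = ExplorationConstant,
        // N = this.Visits, and N(a) = the child's visit count. The m * Rmax factor
        // normalises the exploration term to the scale of the total reward still
        // attainable within the remaining search horizon.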