// Epsilon-greedy action selection for the continuous-action inverted pendulum.
// Note that EPSILON is used here as the probability of exploiting: with
// probability EPSILON the best known action is returned, otherwise a random
// torque is tried.
public override ReinforcementLearningAction ActionForState(ReinforcementLearningState state, ReinforcementLearningQStore qFunction)
{
    if (r.NextDouble() > EPSILON)
    {
        // Explore: random torque in [-1, 1], the same range searched by
        // GetBestActionAndUtilityForState. The policy's shared Random r is used;
        // constructing a new Random on every call reseeds it from the clock.
        return new InvertedPendulumAction2((float)r.NextDouble() * 2 - 1);
    }
    else
    {
        // Exploit: take the action the Q-store currently rates best.
        ReinforcementLearningAction ret;
        float value;
        qFunction.GetBestActionAndUtilityForState(state, out ret, out value);
        return ret;
    }
}
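        // Default stochastic policy: asks the concrete policy for a probability
        // distribution over the actions available in this state, then samples one
        // action by roulette-wheel selection (walk the cumulative sum until it
        // exceeds a uniform random number).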
        public virtual ReinforcementLearningAction ActionForState(ReinforcementLearningState state, ReinforcementLearningQStore qFunction)
        {
            ReinforcementLearningAction[] actions;
            float[] probs;
            ActionProbabilities(state, qFunction, out actions, out probs);

            float rand = (float)r.NextDouble();

            int i = 0;
            float sum = 0;
            do
            {
                sum += probs[i]; ++i;
            }
            while ((i < actions.Length) && (sum < rand));

            return actions[i - 1];
        }
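        // Epsilon-greedy distribution: the action the Q-store rates best receives
        // probability Epsilon, and the remaining 1 - Epsilon is shared uniformly
        // among the other actions.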
        public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
        {
            ReinforcementLearningAction action;
            float utility;

            actions = state.GetActions();
            probabilities = new float[actions.Length];

            qFunction.GetBestActionAndUtilityForState(state, out action, out utility);

            for (int i = 0; i < actions.Length; ++i)
            {
                if (actions[i].Equals(action))
                {
                    probabilities[i] = Epsilon;
                }
                else probabilities[i] = (1 - Epsilon) / (actions.Length - 1);
            }            
        }
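        // The action space is continuous, so the maximising action cannot be
        // enumerated: 10 random torques in [-1, 1] are sampled and the one with the
        // highest estimated Q-value is returned together with that value.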
        public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float value)
        {
            
            InvertedPendulumAction2 bestaction = null;
            float bestq = float.MinValue;
            for (int i = 0; i < 10; ++i)
            {
                InvertedPendulumAction2 act = new InvertedPendulumAction2((float)r.NextDouble() * 2 - 1);
                float q = Evaluate(state, act);
                if (q > bestq)
                {
                    bestq = q;
                    bestaction = act;
                }
            }

            action = bestaction;
            value = bestq;   
        }
      public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
      {
          int i;
          int j;
          int k;
          GetIndices((InvertedPendulumState)state, (InvertedPendulumAction)action, out i, out j, out k);    
 
          return value[i,j,k];
      }
 public ReinforcementLearningAction[] GetAllActionsInState(ReinforcementLearningState state)
 {
     if (allActions == null)
     {
         List<CarNavigationAction> list = new List<CarNavigationAction>();
         for (int i = 0; i < LENACTION; ++i)
         {
             CarNavigationAction action;
             GetAction(i, out action);
             list.Add(action);
         }
         allActions = list.ToArray();
     }
     return allActions;
 }
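        // Tabulated Q-function for the car-navigation task: scans all LENACTION
        // discretised actions for the given state cell and returns the one with the
        // highest value. Ties with the middle action index are broken in its favour,
        // which is why that entry is re-checked after the loop.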
        public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float retval)
        {
            float max = float.MinValue;
            int maxact = 0;
            
            int j;
            int k;
            int l;
            GetStateIndices((CarNavigationState)state, out j, out k, out l);


            for (int i = 0; i < LENACTION; ++i)
            {                
                if (value[i, j, k, l] > max)
                {
                    max = value[i, j, k, l];
                    maxact = i;
                }
            }

            if (value[(int)Math.Ceiling(LENACTION / 2.0), j, k, l] == max) maxact = (int)Math.Ceiling(LENACTION / 2.0);

            CarNavigationAction retaction;
            GetAction(maxact, out retaction);
            action = retaction;
            retval = max;
        }
        public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
        {
            int i;
            int j;
            int k;
            int l;
            GetIndices((CarNavigationState)state, (CarNavigationAction)action, out i, out j, out k, out l);

            return value[i, j, k, l];
        }
 // Purely greedy policy: always return the action the Q-store currently rates best.
 public override ReinforcementLearningAction ActionForState(ReinforcementLearningState state, ReinforcementLearningQStore qFunction)
 {
     ReinforcementLearningAction action;
     float utility;
     qFunction.GetBestActionAndUtilityForState(state, out action, out utility);
     return action;
 }
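 // Called at the start of every episode: forget the previous (state, action) pair
 // so that the first Step of the new episode performs no Q-update.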
 public void EpisodeBegin()
 {
     prevAction = null;
     prevState = null;            
 }
        // One interaction step of the Q-learning agent. The policy chooses the next
        // action; the Q-value of the previous state-action pair is then updated with
        // the standard one-step Q-learning rule:
        //   Q(s,a) <- Q(s,a) + Alpha * (reward + Discount * max_a' Q(s',a') - Q(s,a))
        public virtual ReinforcementLearningAction Step(ReinforcementLearningEnvironment env, ReinforcementLearningPolicy policy)
        {
            ReinforcementLearningState state = env.State();

            float reward = env.Reward();

            ReinforcementLearningAction action = policy.ActionForState(state, qFunction);

            if ((prevState != null) && (prevAction != null))
            {
                // max_a' Q(s', a') over the newly observed state s'.
                ReinforcementLearningAction bestAction;
                float Qtp1max;
                qFunction.GetBestActionAndUtilityForState(state, out bestAction, out Qtp1max);

                // Current estimate Q(s, a) for the previous state-action pair.
                float Qt = qFunction.Evaluate(prevState, prevAction);

                // Temporal-difference error scaled by the learning rate.
                float deltaQ = Alpha * (reward + Discount * Qtp1max - Qt);

                if (float.IsNaN(deltaQ)) throw new Exception("Q-learning update produced NaN");

                qFunction.ModifyValue(prevState, prevAction, deltaQ);
            }

            prevAction = action;
            prevState = state;

            return action;
        }
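        // Softmax (Boltzmann) exploration: each action's probability is proportional
        // to exp(Q / Temperature). Subtracting the maximum Q before exponentiating
        // cancels out in the normalisation but prevents exp() from overflowing.
        // Example: Q = {1, 2, 3} with Temperature = 1 gives probabilities of roughly
        // {0.090, 0.245, 0.665}; a lower Temperature concentrates more mass on the
        // best action, a higher one approaches a uniform distribution.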
        public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
        {
            actions = state.GetActions();
            probabilities = new float[actions.Length];
            float maxq = float.MinValue;
            for(int i=0; i<actions.Length; ++i)
            {
                float q = qFunction.Evaluate(state, actions[i]);
                probabilities[i] = q;
                if (q > maxq) maxq = q;
            }

            float sum = 0;

            for (int i = 0; i < actions.Length; ++i)
            {
                probabilities[i] = (float)Math.Exp((probabilities[i] - maxq) / Temperature);
                sum += probabilities[i];                
            }

            for (int i = 0; i < actions.Length; ++i)
            {
                probabilities[i] /= sum;                
            }
        }
 public override void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities)
 {
     throw new NotImplementedException();
 }
 public void ModifyValue(ReinforcementLearningState state, ReinforcementLearningAction action, float delta)
 {
     throw new NotImplementedException();
 }
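        // Tabular update for the inverted pendulum: adds delta to the single cell
        // addressed by the discretised action index and the two state indices.
        // The commented-out block below is an alternative that spreads the update
        // over neighbouring cells with a Gaussian weight.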
        public void ModifyValue(ReinforcementLearningState state, ReinforcementLearningAction action, float delta)
        {
            int i;
            int j;
            int k;
            GetIndices((InvertedPendulumState)state, (InvertedPendulumAction)action, out i, out j, out k);      
            
            value[i, j, k] += delta;


            //for (int jj = j - DD; jj <= j + DD; ++jj)
            //{
            //    for (int kk = Math.Max(k - DD, 0); kk <= Math.Min(k + DD, LEN - 1); ++kk)
            //    {
            //        value[i, (jj + LEN) % LEN, kk] += (float)Math.Exp(-((j - jj) * (j - jj) + (k - kk) * (k - kk)) / DD / DD) / DD / DD * delta;
            //    }
            //}
        }
 public abstract void ActionProbabilities(ReinforcementLearningState state, ReinforcementLearningQStore qFunction, out ReinforcementLearningAction[] actions, out float[] probabilities);
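        // Tabulated Q-function for the discrete-action pendulum: evaluates the three
        // possible actions (-1, 0, +1) in the given state cell and returns the best.
        // Ties between the maximum and the value stored at index 1 are resolved in
        // favour of action 0.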
        public void GetBestActionAndUtilityForState(ReinforcementLearningState state, out ReinforcementLearningAction action, out float retval)
        {
            float max = float.MinValue;
            int maxact = 0;

            int i;
            int j;
            int k;
            GetStateIndices((InvertedPendulumState)state, out j, out k);      

            for (int act = -1; act <= 1; ++act)
            {
                GetActionIndices(new InvertedPendulumAction(act), out i);              
                if (value[i, j, k] > max)
                {
                    max = value[i, j, k];
                    maxact = act;
                }
            }
            if (value[1, j, k] == max) maxact = 0;


            action = new InvertedPendulumAction(maxact);
            retval = max;
        }        
 // Q-value from the IGMN approximator: query with the state's a and w components
 // and the chosen action, and return the first recalled element.
 public float Evaluate(ReinforcementLearningState state, ReinforcementLearningAction action)
 {
     InvertedPendulumState s = (InvertedPendulumState)state;
     InvertedPendulumAction2 a = (InvertedPendulumAction2)action;
     return (float)igmn.Recall(new Vector(new double[] { s.a, s.w, a.action, 0 })).Elements[0];
 }