/// <summary>
        /// Runs policy iteration: builds a fresh state-value table and policy, stores them in
        /// <c>m_valueTable</c> and <c>m_piPolicy</c>, then alternates policy evaluation and
        /// greedy policy improvement until an improvement pass changes no state's action.
        /// </summary>
        private void PolicyIteration()
        {
            // Both tables are indexed by the same five dimensions.
            StateValueTable values = new StateValueTable(m_initMode, m_initValue,
                                                         this.EnvRows, this.EnvCols, // my pos
                                                         this.EnvRows, this.EnvCols, // opp pos
                                                         2);                         // ball ownership status
            m_valueTable = values;

            // Second constructor argument is 0 for Constant init mode, otherwise the action count
            // (presumably the initial action / an upper bound for initial actions - confirm in Policy).
            Policy currentPolicy = new Policy(m_initMode, m_initMode == ValueTableInitModes.Constant ? 0 : m_numActions,
                                              this.EnvRows, this.EnvCols, // my pos
                                              this.EnvRows, this.EnvCols, // opp pos
                                              2);                         // ball ownership status
            m_piPolicy = currentPolicy;

            long numStates = m_valueTable.NumStates;

            bool policyChanged;
            do
            {
                // --- Policy evaluation ---
                // Sweep all states in place until the largest single-state change in a sweep
                // is no longer greater than Theta.
                bool keepSweeping = true;
                while (keepSweeping)
                {
                    double largestChange = 0.0;

                    for (int state = 0; state < numStates; state++)
                    {
                        double before = values.GetValueLinear(state);
                        double after  = EstimateNewValueUsingPolicy(state, currentPolicy);
                        values.SetValueLinear(after, state);
                        largestChange = Math.Max(largestChange, Math.Abs(before - after));
                    }

                    keepSweeping = largestChange > Theta;
                }

                // --- Policy improvement ---
                // Replace each state's action with the one EstimateNewValue reports as best,
                // remembering whether any state's action actually changed.
                policyChanged = false;

                for (int state = 0; state < numStates; state++)
                {
                    int previousAction = currentPolicy.GetValueLinear(state);
                    int greedyAction;
                    EstimateNewValue(state, out greedyAction);

                    currentPolicy.SetValueLinear(greedyAction, state);
                    if (previousAction != greedyAction)
                    {
                        policyChanged = true;
                    }
                }
            } while (policyChanged);
        }
        /// <summary>
        /// Computes the one-step backed-up value of state <paramref name="s"/> when taking the
        /// action that <paramref name="policy"/> stores for that state, using the values currently
        /// held in <c>m_valueTable</c>.
        /// </summary>
        /// <param name="s">Linear index of the state to evaluate.</param>
        /// <param name="policy">Policy whose action for <paramref name="s"/> is evaluated.</param>
        /// <returns>
        /// The sum, over the next states reported by <c>GetPossibleNextStates</c>, of
        /// <c>prob * (reward + Gamma * V(nextState))</c>.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// <c>m_valueTable</c> is null or is not a <c>StateValueTable</c>.
        /// </exception>
        private double EstimateNewValueUsingPolicy(int s, Policy policy)
        {
            StateValueTable valueTable = m_valueTable as StateValueTable;

            if (valueTable == null)
            {
                // A specific exception type instead of the bare System.Exception base class;
                // it still derives from Exception, so existing catch blocks keep working.
                throw new InvalidOperationException("A ValueTable needed!");
            }

            List <int>    nextStates;
            List <double> nextStateProbs;
            List <double> nextStateRew;

            // Action the policy prescribes for this state.
            int a = policy.GetValueLinear(s);

            // The three lists are read as parallel lists: entry i of each describes one outcome.
            GetPossibleNextStates(s, a, out nextStates, out nextStateProbs, out nextStateRew);

            double nextV = 0.0;

            for (int i = 0; i < nextStates.Count; i++)
            {
                nextV += nextStateProbs[i] * (nextStateRew[i] + (Gamma * valueTable.GetValueLinear(nextStates[i])));
            }

            return(nextV);
        }
        /// <summary>
        /// Runs value iteration: builds a fresh state-value table, stores it in
        /// <c>m_valueTable</c>, and repeatedly overwrites every state's value with the result of
        /// <c>EstimateNewValue</c> until the largest single-state change in a sweep is no longer
        /// greater than <c>Theta</c>.
        /// </summary>
        private void ValueIteration()
        {
            StateValueTable values = new StateValueTable(m_initMode, m_initValue,
                                                         this.EnvRows, this.EnvCols, // my pos
                                                         this.EnvRows, this.EnvCols, // opp pos
                                                         2);                         // ball ownership status
            m_valueTable = values;

            long numStates = m_valueTable.NumStates;

            bool keepSweeping = true;
            while (keepSweeping)
            {
                // Largest absolute value change seen during this sweep.
                double largestChange = 0.0;

                for (int state = 0; state < numStates; state++)
                {
                    double before = values.GetValueLinear(state);
                    double after  = EstimateNewValue(state);

                    // Updates are in place, so later states in this sweep see the new value.
                    values.SetValueLinear(after, state);
                    largestChange = Math.Max(largestChange, Math.Abs(before - after));
                }

                keepSweeping = largestChange > Theta;
            }
        }
        /// <summary>
        /// Evaluates every action index in <c>[0, m_numActions)</c> for state <paramref name="s"/>
        /// with a one-step backup against <c>m_valueTable</c> and reports the best one.
        /// </summary>
        /// <param name="s">Linear index of the state to evaluate.</param>
        /// <param name="maxA">
        /// Receives the action with the highest backed-up value; on ties the lowest action index
        /// wins because only a strictly greater value replaces the current best. Receives -1 when
        /// no action produced a value greater than <c>Double.MinValue</c> (for example when
        /// <c>m_numActions</c> is 0).
        /// </param>
        /// <returns>
        /// The highest backed-up value found, i.e. the maximum over actions of the sum of
        /// <c>prob * (reward + Gamma * V(nextState))</c>; <c>Double.MinValue</c> when
        /// <paramref name="maxA"/> is -1.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// <c>m_valueTable</c> is null or is not a <c>StateValueTable</c>.
        /// </exception>
        private double EstimateNewValue(int s, out int maxA)
        {
            StateValueTable valueTable = m_valueTable as StateValueTable;

            if (valueTable == null)
            {
                // A specific exception type instead of the bare System.Exception base class;
                // it still derives from Exception, so existing catch blocks keep working.
                throw new InvalidOperationException("A ValueTable needed!");
            }

            List <int>    nextStates;
            List <double> nextStateProbs;
            List <double> nextStateRew;

            // Sentinels: most negative finite double and "no action selected".
            double maxV = Double.MinValue;

            maxA = -1;

            for (int a = 0; a < m_numActions; a++)
            {
                // The three lists are read as parallel lists: entry i of each describes one outcome.
                GetPossibleNextStates(s, a, out nextStates, out nextStateProbs, out nextStateRew);

                double nextV = 0.0;
                for (int i = 0; i < nextStates.Count; i++)
                {
                    nextV += nextStateProbs[i] * (nextStateRew[i] + (Gamma * valueTable.GetValueLinear(nextStates[i])));
                }

                // Strict comparison keeps the first (lowest-index) action among equal values.
                if (nextV > maxV)
                {
                    maxV = nextV;
                    maxA = a;
                }
            }

            return(maxV);
        }