        // Backward dynamic-programming pass over the planning graph: walk the
        // levels from fromLevel down to stopAt, propagating each node's best
        // successor total reward into futureRewards and refreshing totalRewards.
        void Backtrack(Rewards reward, PathPlanningGraph graph, int fromLevel, int stopAt = 0)
        {
            for (int t = fromLevel; t >= stopAt; t--)
            {
                int num = reward.totalRewards[t].Length;
                for (int i = 0; i < num; i++)
                {
                    PlanningNode        node  = graph[t].mNodes[i];
                    List <PlanningEdge> edges = graph[t].GetEdges(node);

                    // the future reward of node i is the best total reward
                    // among its successors at level t + 1
                    foreach (PlanningEdge edge in edges)
                    {
                        int j = graph[t + 1].GetIndex(edge.to);
                        if (reward.totalRewards[t + 1][j] > reward.futureRewards[t][i])
                        {
                            reward.futureRewards[t][i] = reward.totalRewards[t + 1][j];
                        }
                    }
                    reward.totalRewards[t][i] = reward.instantRewards[t][i] + reward.futureRewards[t][i];
                }
            }
        }
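        // Greedy one-step lookahead: from the last position of the current path,
        // return the successor position at the next level with the highest total
        // reward, or null when no successor beats the sentinel.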
        HexaPos GetMax(Rewards reward, PathPlanningGraph graph, HexaPath path)
        {
            int     pathLen = path.Length;
            HexaPos lastPos = path[pathLen - 1];

            PlanningNode        lastNode = graph[pathLen - 1].GetNode(lastPos);
            List <PlanningEdge> edges    = graph[pathLen - 1].GetEdges(lastNode);

            double  maxVal = -0.01;    // sentinel below any achievable (non-negative) reward
            HexaPos maxPos = null;

            foreach (PlanningEdge edge in edges)
            {
                int nextIdx = graph[pathLen].GetIndex(edge.to);
                if (reward.totalRewards[pathLen][nextIdx] > maxVal)
                {
                    maxPos = graph[pathLen].mNodes[nextIdx].pos;
                    maxVal = reward.totalRewards[pathLen][nextIdx];
                }
            }

            return(maxPos);
        }
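        // Refresh the optimistic reward estimates for every candidate node at
        // the current path level: each candidate is evaluated on its own cloned
        // entropy map, then its estimates are re-propagated backward.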
        void UpdateRewardOptEst(Rewards reward, double[,] entropy, PathPlanningGraph graph, HexaPath path)
        {
            int currentLen = path.Length;
            int totalLen   = graph.planningLength;
            int num        = graph[currentLen].mNodes.Count;

            for (int i = 0; i < num; i++)
            {
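                // simulate visiting candidate node i: clone the entropy map and
                // apply the node's observation to the clone only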
                PlanningNode node    = graph[currentLen].mNodes[i];
                HexaPath     newpath = new HexaPath();
                newpath.AddPos(node.pos);
                double[,] localEntropy = (double[, ])entropy.Clone();
                _agent.Update(newpath, localEntropy);
                Rewards newreward = new Rewards(reward);
                // update estimation
                UpdateEstimation(newreward, localEntropy, graph, currentLen, totalLen - 1);
                // backtrack
                Backtrack(newreward, graph, totalLen - 2, currentLen);
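                // adopt the refreshed estimates for this candidate node only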
                reward.instantRewards[currentLen][i] = newreward.instantRewards[currentLen][i];
                reward.futureRewards[currentLen][i]  = newreward.futureRewards[currentLen][i];
                reward.totalRewards[currentLen][i]   = newreward.totalRewards[currentLen][i];
            }
        }
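        // Plan a path by maintaining two greedy candidates, one driven by the
        // optimistic reward estimate and one by the pessimistic estimate, then
        // iteratively tightening the pessimistic estimate until the two agree
        // or the retry budget runs out.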
        public override HexaPath FindPath(PathPlanningGraph graph, HexaPos start)
        {
            int planningLength = graph.planningLength;

            _estimated = new Rewards(graph);

            HexaPath path = new HexaPath();

            _optEstimated        = new Rewards(_estimated);
            _pesEstimated        = new Rewards(_estimated);
            double[,] optEntropy = (double[, ])(_localEntropy.Clone());
            double[,] pesEntropy = (double[, ])(_localEntropy.Clone());

            HexaPath optMaxPath = new HexaPath();
            HexaPath pesMaxPath = new HexaPath();

            optMaxPath.AddPos(start);
            pesMaxPath.AddPos(start);

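            // grow both candidate paths one level at a time, each greedily
            // following its own reward estimate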
            for (int t = 1; t < planningLength; t++)
            {
                // get path for opt
                _agent.Update(optMaxPath, optEntropy);
                UpdateRewardOptEst(_optEstimated, optEntropy, graph, optMaxPath);
                HexaPos nextOptPos = GetMax(_optEstimated, graph, optMaxPath);
                optMaxPath.AddPos(nextOptPos);

                // get path for pes
                _agent.Update(pesMaxPath, pesEntropy);
                UpdateRewardPesEst(_pesEstimated, pesEntropy, graph, pesMaxPath);
                HexaPos nextPesPos = GetMax(_pesEstimated, graph, pesMaxPath);
                pesMaxPath.AddPos(nextPesPos);
            }

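            // score both finished candidates against the original entropy map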
            double optMaxScore = _agent.Score(optMaxPath, _localEntropy);
            double pesMaxScore = _agent.Score(pesMaxPath, _localEntropy);

            int  maxTryCnt = 10;
            bool converged = false;
            int  tryCnt    = 0;

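            // accept the pessimistic path once its score matches or beats the
            // optimistic one; otherwise correct the pessimistic estimate and re-plan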
            while (!converged && tryCnt <= maxTryCnt)
            {
                tryCnt++;

                if (pesMaxScore >= optMaxScore)
                {
                    path      = pesMaxPath;
                    converged = true;
                }
                else
                {
                    path = optMaxPath;

                    // correct the pessimistic estimate at the first level where
                    // the two paths diverge, using the true score of the
                    // optimistic tail from that level onward
                    int          diffFrom = pesMaxPath.DifferentAt(optMaxPath);
                    HexaPath     subpath  = optMaxPath.SubPath(diffFrom, optMaxPath.Length - 1);
                    PlanningNode diffNode = graph[diffFrom].GetNode(subpath[0]);
                    int          diffIdx  = graph[diffFrom].GetIndex(diffNode);
                    HexaPath     prevPath = optMaxPath.SubPath(0, diffFrom - 1);
                    double[,] tempEntropy = (double[, ])(_localEntropy.Clone());
                    _agent.Update(prevPath, tempEntropy);
                    _pesEstimated.totalRewards[diffFrom][diffIdx] = _agent.Score(subpath, tempEntropy);

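                    // re-plan a pessimistic candidate from the start using the
                    // corrected reward table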
                    HexaPath newCandidatePath = new HexaPath();
                    newCandidatePath.AddPos(start);

                    HexaPath newSubCandidate = EstimatePath(graph, 1, start, _pesEstimated);
                    newCandidatePath.Merge(newSubCandidate);

                    // score the full candidate (including the start position) so
                    // the comparison with pesMaxScore is like-for-like
                    double newCandidateScore = _agent.Score(newCandidatePath, _localEntropy);

                    if (newCandidateScore <= pesMaxScore)
                    {
                        converged = true;
                    }
                    else
                    {
                        pesMaxScore = newCandidateScore;
                        pesMaxPath  = newCandidatePath;
                    }
                }
            }

            return(path);
        }
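        // Walk the graph greedily from currentLevel to the last level, at each
        // step moving to the successor with the highest estimated total reward.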
        HexaPath EstimatePath(PathPlanningGraph graph, int currentLevel, HexaPos lastPos, Rewards reward)
        {
            HexaPath newpath  = new HexaPath();
            int      endLevel = graph.planningLength - 1;

            for (int t = currentLevel; t <= endLevel; t++)
            {
                PlanningNode        lastNode = graph[t - 1].GetNode(lastPos);
                List <PlanningEdge> edges    = graph[t - 1].GetEdges(lastNode);

                double  maxVal = -0.01;    // sentinel below any achievable (non-negative) reward
                HexaPos maxPos = null;

                // follow the edge whose head has the highest estimated total reward
                foreach (PlanningEdge edge in edges)
                {
                    int nextIdx = graph[t].GetIndex(edge.to);
                    if (reward.totalRewards[t][nextIdx] > maxVal)
                    {
                        maxPos = graph[t].mNodes[nextIdx].pos;
                        maxVal = reward.totalRewards[t][nextIdx];
                    }
                }

                newpath.AddPos(maxPos);
                lastPos = maxPos;
            }

            return(newpath);
        }