Code Example #1
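This method implements the main loop of Prioritized Sweeping (model-based Q-learning that replays the highest-priority pending backups first), extended here with context switching, visit-weighted softmax action selection, and variance-based subgoal/option discovery. The (b)-(g) comments mirror the steps of the standard Prioritized Sweeping pseudocode.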
    public void LearnPS(int N, int episodes)
    {
        subgoalVar = new List <double>();
        forceStop  = false;
        //int stateHold = currentState;

        float alpha = 0.2f, gamma = 0.95f, theta = 0.001f; // learning rate, discount factor, priority threshold
        int   s = currentState; //Initialize

        List <PriorityObj> PQueue = new List <PriorityObj>();

        visit = new Dictionary <int, int>();
        int epsilon_counter = 0;
        int steps           = 0;

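        // Main loop: each iteration executes one primitive action in the environment,
        // updates the learned model, and performs up to N prioritized backups.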
        while (true)
        {
            if (forceStop)
            {
                break;
            }

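            // Every 100 steps, refresh the exploration rate with an exponentially
            // decaying schedule (e approaches 0.01 as discount grows).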
            if (epsilon_counter++ % 100 == 0)
            {
                epsilon_counter = 1;
                e = 0.01 + 0.7 * Math.Exp(-(discount++) / 20.0);
            }

            int tempSteps = steps; // Steps taken so far; (steps - tempSteps) gives this transition's duration

            if (EpisodeSteps.Count > 1500) // Hard cap on the number of episodes
            {
                forceStop = true;
            }

            //(b) a <- policy(s,Q)
            //int a = e_greedy(Q[s]);
            int a = softmaxWVisits(Q[s]);
            // Q[s][a] -= -0.001;
            double tempVal = Q[s][a];
            //tempFirstVisitHist[s[0], s[1]] += 1;
            //(c) Execute action a
            int     sP      = 0;
            double  r       = 0;
            Context curCtxt = currentContext;
            lock (threadLock)
            {
                executeAction(s, a, ref sP, ref r, ref steps);
            }
            if (!visit.ContainsKey(sP))
            {
                visit.Add(sP, 0);
            }
            Context nexCtxt = currentContext;
            if (curCtxt != nexCtxt)
            {
                //double tmp = curCtxt.Q[s][Q[s].GetMaxAction()];
                //for (int i = 0; i < curCtxt.Q[s].Actions.Count; i++)
                //{
                //    Q[s][i] += curCtxt.Q[s][i];//tmp / 8;
                //}
                // The context changed while the action was executing; discard this
                // sample and start the next step in the new context.
                continue;
            }
            //SwitchContext(curCtxt);
            int result = sP;

            //(d) Model(s,a) <- s', r
            //Model[s[0], s[1], a] = new int[] { sP[0], sP[1], r, steps - tempSteps };

            if (Model[s, a] == null)
            {
                ModelValue m = new ModelValue(s, a, sP, r, steps - tempSteps);
                Model[s, a] = m;
            }
            else if (Model[s, a].calculateEm(sP, r) > 0)
            {
                Model[s, a].Update(s, a, sP, r, steps - tempSteps);
            }
            //System.Diagnostics.Debug.WriteLine("\tContext: " + currentContext.cID + "\t(s,a,sp): " + s + ", " + a + ", " + sP + "\tTm0: " /*+ (Model[s, a].Tm.Count > 0 ? Model[s,a].Tm[0].ToString() : " ") */ + "\tem0: " + em + "\tEm0: " + contexts[0].Em + "\tEm1: " + (contexts.Count > 1 ? contexts[1].Em.ToString() : "-"));

            //(e) p <- |r + gamma*maxA'Q(s',a') - Q(s,a)|
            //float p = Math.Abs(r + gamma * getQ(sP[0], sP[1], maxA(sP)) - getQ(s[0], s[1], a));
            double p = Math.Abs(r + gamma * Q[sP][Q[sP].GetMaxAction()] - Q[s][a]);

            //(f) if p > theta, then insert s,a into PQueue with priority p
            PQueue.Clear();
            if (p > theta)
            {
                InsertQueue(PQueue, s, a, p);
            }

            //(g) Repeat N times while PQueue is not empty
            for (int i = 0; i < N && 0 < PQueue.Count; i++)
            {
                //(-)s, a <- first(PQueue)
                PriorityObj obj = PQueue[0]; // first(PQueue): highest-priority pending backup
                PQueue.RemoveAt(0);
                s = obj.State;
                a = obj.Action;

                //(-)s', r <- Model(s,a)
                sP = Model[s, a].sP;
                r  = Model[s, a].reward;
                int tsteps = Model[s, a].steps;

                //(-)Q(s,a) <- Q(s,a) + alpha[r + gamma*maxA'Q(s',a') - Q(s,a)]
                //e_updates.Add(new double[] { s, a, Q[s][a] });
                Q[s][a] = Q[s][a] + alpha * (r + Math.Pow(gamma, tsteps) * Q[sP][Q[sP].GetMaxAction()] - Q[s][a]);
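                // gamma is raised to the number of primitive steps the stored transition
                // took, so temporally extended transitions are discounted over their
                // full duration (SMDP-style discounting).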

                //float t = (float)(getQ(s[0], s[1], a) + alpha * (r + Math.Pow(gamma, tsteps) * getQ(sP[0], sP[1], maxA(sP)) - getQ(s[0], s[1], a)));
                //setQ(s[0], s[1], a, t);

                //(-)Repeat, for all s",a" predicted to lead to s
                List <ModelValue> list = Model.All_SA_PredictedToLeadTo(s);
                for (int j = 0; j < list.Count; j++)
                {
                    int sDP = list[j].s;
                    int aDP = list[j].a;

                    // r" <- predicted reward
                    double rDP = list[j].reward;

                    // p <- |r" + gamma*maxaQ(s,a) - Q(s",a")|
                    p = Math.Abs(rDP + gamma * Q[s][Q[s].GetMaxAction()] - Q[sDP][aDP]);

                    // if p > theta, then insert s",a" into PQueue with priority p
                    if (p > theta)
                    {
                        InsertQueue(PQueue, sDP, aDP, p);
                    }
                }
            }
            s = result;
            environment.State = s;
            if (environment.isGoal(s))
            {
                //visit = new Dictionary<int, int>();///////////
                //if (subgoalValues != null)
                //{
                //    subgoalValues[s] = 0.1;
                //    for (int f = 0; f < 4; f++)
                //    {
                //        if (Model.StateCount < s)
                //            Model.States.Add(new ModelState(s));
                //        if (Model[s] == null)
                //            Model[s] = new ModelState(s);
                //        if(Model[s].Count <= 4)
                //            Model[s].Add(new ModelValue(s, f, s, 0, 0));
                //    }
                //}
                if (Model.States.Count <= s)
                {
                    Model[s, a]     = null;
                    Model.States[s] = new ModelState(s);
                }
                //e = 0.9 * Math.Pow(Math.E, -(discount++) / 50.0); //Update epsilon
                if (!DisableOptions && !m_optsLearned)
                {
                    //initiationSetMembers = new int[Model.StateCount];
                    //initiationSets = new List<InitiationSet>();
                    Subgoals = SearchSubgoals();
                    //Subgoals.Add(new SubGoal(s));
                    SubgoalCounts.Add(Subgoals.Count);
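                    // Convergence test for subgoal discovery: track the std of the
                    // subgoal count over the last sc episodes; a near-zero std means
                    // the discovered subgoal set has stabilized.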
                    double mean = 0;
                    double std  = 100;
                    int    sc   = 10;
                    if (SubgoalCounts.Count > sc)
                    {
                        std = 0;
                        for (int c = SubgoalCounts.Count - sc; c < SubgoalCounts.Count; c++)
                        {
                            mean += SubgoalCounts[c] / (double)sc;
                        }
                        for (int c = SubgoalCounts.Count - sc; c < SubgoalCounts.Count; c++)
                        {
                            std += Math.Pow(SubgoalCounts[c] - mean, 2) / sc;
                        }
                        subgoalVar.Add(std);
                        std = Math.Sqrt(std);
                    }
                    else
                    {
                        subgoalVar.Add(-1);
                    }


                    if (std < 0.01)
                    {
                        List <OptionN> options = createOptFromSubGoals(Subgoals);
                        currentContext.options = options;
                        optNonLearnedVals      = 0;
                        optLearnedVals         = 0;
                        for (int i = 0; i < Model.StateCount; i++)
                        {
                            if (Model[i] == null)
                            {
                                optNonLearnedVals += 4;
                            }
                            else
                            {
                                for (int j = 0; j < Model[i].Count; j++)
                                {
                                    if (Model[i][j] == null || Q[i][j] == 0)
                                    {
                                        optNonLearnedVals += 1;
                                    }
                                    else
                                    {
                                        optLearnedVals += 1;
                                    }
                                }
                            }
                        }
                        //SearchInitiationSets(Subgoals); // This line should be inside the if (and it is, right?)
                        //List<Option> options = CreateOptions();
                        OptionLearn(options);
                        //currentContext.options = options;
                    }
                }

                s = currentState;
                EpisodeSteps.Add(steps);
                ContextTrack.Add(currentContext.cID);
                currentContext.EpisodeSteps.Add(steps);
                steps = 0;
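                // Stopping criterion: std of episode lengths over a sliding window of
                // the last calcLength episodes; learning stops once no context's std
                // exceeds 10 (unless stopping by a fixed episode count).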
                int calcLength = 9;
                if (currentContext.EpisodeSteps.Count > calcLength)
                {
                    double mean = 0;
                    double std  = 0;
                    for (int c = currentContext.EpisodeSteps.Count - calcLength; c < currentContext.EpisodeSteps.Count; c++)
                    {
                        mean += currentContext.EpisodeSteps[c] / (double)calcLength;
                    }
                    for (int c = currentContext.EpisodeSteps.Count - calcLength; c < currentContext.EpisodeSteps.Count; c++)
                    {
                        std += Math.Pow(currentContext.EpisodeSteps[c] - mean, 2) / calcLength;
                    }
                    std = Math.Sqrt(std);
                    currentContext.std = std;

                    bool stop = true;
                    for (int cx = 0; cx < contexts.Count; cx++)
                    {
                        if (contexts[cx].std > 10)
                        {
                            stop = false;
                        }
                        //System.Diagnostics.Debug.Write(cx + ":" + contexts[cx].std + "   ");
                    }
                    //System.Diagnostics.Debug.Write("\n");
                    if (stopByEpisode)
                    {
                        if (stopEpisode <= EpisodeSteps.Count)
                        {
                            break;
                        }
                    }
                    else
                    {
                        if (stop || forceStop)
                        {
                            break;
                        }
                    }
                }// Try with variance

                //if (!m_optsLearned)
                //{
                //    OptionSearch();
                //    OptionLearn();
                //}
                PQueue.Clear();
                if (curCtxt != nexCtxt)
                {
                    SwitchContext(nexCtxt);
                }
                if (oneStepLearn)
                {
                    break;
                }
                e_updates = new List <double[]>();
            }
        }
        environment.State = currentState;
    }
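
The loop also depends on a priority-queue helper that is not shown. Below is a minimal sketch of what PriorityObj and InsertQueue presumably look like, inferred only from how they are used above (the fields and the keep-the-larger-priority rule are assumptions, not the original implementation): the list is kept sorted in descending priority so that PQueue[0] is always first(PQueue).

    // Sketch only: inferred from usage in LearnPS; not the original implementation.
    public class PriorityObj
    {
        public int    State;
        public int    Action;
        public double Priority;

        public PriorityObj(int state, int action, double priority)
        {
            State    = state;
            Action   = action;
            Priority = priority;
        }
    }

    // Keeps the list sorted in descending priority so that queue[0] is first(PQueue).
    // If (s, a) is already queued, only the higher of the two priorities is kept.
    private void InsertQueue(List <PriorityObj> queue, int s, int a, double p)
    {
        for (int i = 0; i < queue.Count; i++)
        {
            if (queue[i].State == s && queue[i].Action == a)
            {
                if (queue[i].Priority >= p)
                {
                    return; // existing entry already dominates
                }
                queue.RemoveAt(i); // re-insert below with the higher priority
                break;
            }
        }
        int pos = 0;
        while (pos < queue.Count && queue[pos].Priority >= p)
        {
            pos++;
        }
        queue.Insert(pos, new PriorityObj(s, a, p));
    }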