// Prioritized sweeping with softmax action selection, per-context models and
// subgoal-based option discovery. Learning stops when every context's episode-length
// std falls below threshold, when stopEpisode episodes are reached (stopByEpisode),
// or when forceStop is set. (The episodes parameter is currently unused.)
public void LearnPS(int N, int episodes)
{
    subgoalVar = new List<double>();
    forceStop = false;
    //int stateHold = currentState;
    float alpha = 0.2f, gamma = 0.95f, theta = 0.001f;
    int s = currentState;

    //Initialize
    List<PriorityObj> PQueue = new List<PriorityObj>();
    visit = new Dictionary<int, int>();
    int epsilon_counter = 0;
    int steps = 0;

    while (true)
    {
        if (forceStop)
        {
            break;
        }

        if (epsilon_counter++ % 100 == 0)
        {
            epsilon_counter = 1;
            e = 0.01 + 0.7 * Math.Pow(Math.E, -(discount++) / 20.0);
        }

        int tempSteps = steps; //Number of steps per episode
        if (EpisodeSteps.Count > 1500)
        {
            forceStop = true;
        }

        //(b) a <- policy(s, Q)
        //int a = e_greedy(Q[s]);
        int a = softmaxWVisits(Q[s]);
        //Q[s][a] -= -0.001;
        double tempVal = Q[s][a];
        //tempFirstVisitHist[s[0], s[1]] += 1;

        //(c) Execute action a
        int sP = 0;
        double r = 0;
        Context curCtxt = currentContext;
        lock (threadLock)
        {
            executeAction(s, a, ref sP, ref r, ref steps);
        }
        if (!visit.ContainsKey(sP))
        {
            visit.Add(sP, 0);
        }

        Context nexCtxt = currentContext;
        if (curCtxt != nexCtxt)
        {
            //double tmp = curCtxt.Q[s][Q[s].GetMaxAction()];
            //for (int i = 0; i < curCtxt.Q[s].Actions.Count; i++)
            //{
            //    Q[s][i] += curCtxt.Q[s][i]; //tmp / 8;
            //}
            continue;
        }
        //SwitchContext(curCtxt);
        int result = sP;

        //(d) Model(s, a) <- s', r
        //Model[s[0], s[1], a] = new int[] { sP[0], sP[1], r, steps - tempSteps };
        if (Model[s, a] == null)
        {
            ModelValue m = new ModelValue(s, a, sP, r, steps - tempSteps);
            Model[s, a] = m;
        }
        else if (Model[s, a].calculateEm(sP, r) > 0)
        {
            Model[s, a].Update(s, a, sP, r, steps - tempSteps);
        }
        //System.Diagnostics.Debug.WriteLine("\tContext: " + currentContext.cID + "\t(s,a,sp): " + s + ", " + a + ", " + sP + "\tTm0: " /*+ (Model[s, a].Tm.Count > 0 ? Model[s, a].Tm[0].ToString() : " ") */ + "\tem0: " + em + "\tEm0: " + contexts[0].Em + "\tEm1: " + (contexts.Count > 1 ? contexts[1].Em.ToString() : "-"));

        //(e) p <- |r + gamma*maxA'Q(s',a') - Q(s,a)|
        //float p = Math.Abs(r + gamma * getQ(sP[0], sP[1], maxA(sP)) - getQ(s[0], s[1], a));
        double p = Math.Abs(r + gamma * Q[sP][Q[sP].GetMaxAction()] - Q[s][a]);

        //(f) if p > theta, then insert s,a into PQueue with priority p
        PQueue.Clear();
        if (p > theta)
        {
            InsertQueue(PQueue, s, a, p);
        }

        //(g) Repeat N times while PQueue is not empty
        for (int i = 0; i < N && 0 < PQueue.Count; i++)
        {
            //(-) s, a <- first(PQueue)
            PriorityObj obj = PQueue[0]; //try this!
            PQueue.Remove(obj);
            s = obj.State;
            a = obj.Action;

            //(-) s', r <- Model(s, a)
            sP = Model[s, a].sP;
            r = Model[s, a].reward;
            int tsteps = Model[s, a].steps;

            //(-) Q(s,a) <- Q(s,a) + alpha[r + gamma*maxA'Q(s',a') - Q(s,a)]
            //e_updates.Add(new double[] { s, a, Q[s][a] });
            Q[s][a] = Q[s][a] + alpha * (r + Math.Pow(gamma, tsteps) * Q[sP][Q[sP].GetMaxAction()] - Q[s][a]);
            //float t = (float)(getQ(s[0], s[1], a) + alpha * (r + Math.Pow(gamma, tsteps) * getQ(sP[0], sP[1], maxA(sP)) - getQ(s[0], s[1], a)));
            //setQ(s[0], s[1], a, t);

            //(-) Repeat, for all s", a" predicted to lead to s
            List<ModelValue> list = Model.All_SA_PredictedToLeadTo(s);
            for (int j = 0; j < list.Count; j++)
            {
                int sDP = list[j].s;
                int aDP = list[j].a;
                // r" <- predicted reward
                double rDP = list[j].reward;
                // p <- |r" + gamma*maxA Q(s,a) - Q(s",a")|
                p = Math.Abs(rDP + gamma * Q[s][Q[s].GetMaxAction()] - Q[sDP][aDP]);
                // if p > theta, then insert s",a" into PQueue with priority p
                if (p > theta)
                {
                    InsertQueue(PQueue, sDP, aDP, p);
                }
            }
        }

        s = result;
        environment.State = s;

        if (environment.isGoal(s))
        {
            //visit = new Dictionary<int, int>();
            //if (subgoalValues != null)
            //{
            //    subgoalValues[s] = 0.1;
            //    for (int f = 0; f < 4; f++)
            //    {
            //        if (Model.StateCount < s)
            //            Model.States.Add(new ModelState(s));
            //        if (Model[s] == null)
            //            Model[s] = new ModelState(s);
            //        if (Model[s].Count <= 4)
            //            Model[s].Add(new ModelValue(s, f, s, 0, 0));
            //    }
            //}
            if (Model.States.Count <= s)
            {
                Model[s, a] = null;
                Model.States[s] = new ModelState(s);
            }
            //e = 0.9 * Math.Pow(Math.E, -(discount++) / 50.0); //Update epsilon

            if (!DisableOptions && !m_optsLearned)
            {
                //initiationSetMembers = new int[Model.StateCount];
                //initiationSets = new List<InitiationSet>();
                Subgoals = SearchSubgoals();
                //Subgoals.Add(new SubGoal(s));
                SubgoalCounts.Add(Subgoals.Count);

                // Mean and std of the subgoal count over the last sc episodes;
                // options are created only once the count has stabilized.
                double mean = 0;
                double std = 100;
                int sc = 10;
                if (SubgoalCounts.Count > sc)
                {
                    std = 0;
                    for (int c = SubgoalCounts.Count - sc; c < SubgoalCounts.Count; c++)
                    {
                        mean += SubgoalCounts[c] / (double)sc;
                    }
                    for (int c = SubgoalCounts.Count - sc; c < SubgoalCounts.Count; c++)
                    {
                        std += Math.Pow(SubgoalCounts[c] - mean, 2) / sc;
                    }
                    subgoalVar.Add(std);
                    std = Math.Sqrt(std);
                }
                else
                {
                    subgoalVar.Add(-1);
                }

                if (std < 0.01)
                {
                    List<OptionN> options = createOptFromSubGoals(Subgoals);
                    currentContext.options = options;

                    // Count how many (state, action) model/Q entries are still unlearned.
                    optNonLearnedVals = 0;
                    optLearnedVals = 0;
                    for (int i = 0; i < Model.StateCount; i++)
                    {
                        if (Model[i] == null)
                        {
                            optNonLearnedVals += 4;
                        }
                        else
                        {
                            for (int j = 0; j < Model[i].Count; j++)
                            {
                                if (Model[i][j] == null || Q[i][j] == 0)
                                {
                                    optNonLearnedVals += 1;
                                }
                                else
                                {
                                    optLearnedVals += 1;
                                }
                            }
                        }
                    }
                    //SearchInitiationSets(Subgoals); //This line should be inside the if (and it is, right?)
                    //List<Option> options = CreateOptions();
                    OptionLearn(options);
                    //currentContext.options = options;
                }
            }

            s = currentState;
            EpisodeSteps.Add(steps);
            ContextTrack.Add(currentContext.cID);
            currentContext.EpisodeSteps.Add(steps);
            steps = 0;

            int calcLength = 9;
            if (currentContext.EpisodeSteps.Count > calcLength)
            {
                // Mean and std of the last calcLength episode lengths for this context;
                // learning stops once every context's episode-length std is small enough.
                double mean = 0;
                double std = 0;
                for (int c = currentContext.EpisodeSteps.Count - calcLength; c < currentContext.EpisodeSteps.Count; c++)
                {
                    mean += currentContext.EpisodeSteps[c] / (double)calcLength;
                }
                for (int c = currentContext.EpisodeSteps.Count - calcLength; c < currentContext.EpisodeSteps.Count; c++)
                {
                    std += Math.Pow(currentContext.EpisodeSteps[c] - mean, 2) / calcLength;
                }
                std = Math.Sqrt(std);
                currentContext.std = std;

                bool stop = true;
                for (int cx = 0; cx < contexts.Count; cx++)
                {
                    if (contexts[cx].std > 10)
                    {
                        stop = false;
                    }
                    //System.Diagnostics.Debug.Write(cx + ":" + contexts[cx].std + " ");
                }
                //System.Diagnostics.Debug.Write("\n");

                if (stopByEpisode)
                {
                    if (stopEpisode <= EpisodeSteps.Count)
                    {
                        break;
                    }
                }
                else
                {
                    if (stop || forceStop)
                    {
                        break;
                    }
                }
            } //Try with variance

            //if (!m_optsLearned)
            //{
            //    OptionSearch();
            //    OptionLearn();
            //}
            PQueue.Clear();
            if (curCtxt != nexCtxt)
            {
                SwitchContext(nexCtxt);
            }
            if (oneStepLearn)
            {
                break;
            }
            e_updates = new List<double[]>();
        }
    }

    environment.State = currentState;
}
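// Note: PriorityObj and InsertQueue are not defined in this section. The sweep above pops
// PQueue[0] as the highest-priority (state, action) pair, so InsertQueue presumably keeps
// the list ordered by descending priority. The sketch below is a minimal, assumed
// implementation of these two helpers for illustration only; the project's actual
// versions may differ.
class PriorityObj
{
    public int State;
    public int Action;
    public double Priority;

    public PriorityObj(int state, int action, double priority)
    {
        State = state;
        Action = action;
        Priority = priority;
    }
}

// Insert (s, a) with priority p, keeping PQueue sorted by descending priority.
// If (s, a) is already queued, keep only the larger of the two priorities.
void InsertQueue(List<PriorityObj> PQueue, int s, int a, double p)
{
    PriorityObj existing = PQueue.Find(o => o.State == s && o.Action == a);
    if (existing != null)
    {
        if (existing.Priority >= p)
        {
            return;              // already queued with at least this priority
        }
        PQueue.Remove(existing); // re-insert at its new position
    }

    int idx = 0;
    while (idx < PQueue.Count && PQueue[idx].Priority >= p)
    {
        idx++;
    }
    PQueue.Insert(idx, new PriorityObj(s, a, p));
}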