/// <summary> /// 对环境做出评估 /// Agent对环境有个好恶,该好恶与目标任务无关,只与它在环境中接收的奖励或惩罚有关 /// /// </summary> /// <returns></returns> private double doEvaluateEnviornment(int time, PolicyState policyState) { var obs = net.GetReceoptorSplit(); Vector env = obs.env; //在记忆库中查找当前环境 SceneItem sceneItem = this.scene.GetMatched(env); if (sceneItem != null) { policyState.AddEnviormentEvaluation(0, env, sceneItem.evaluation); return(sceneItem.evaluation); } double forcastReward = net.DoForcastReward(time, obs.gesture, env, 3, 0); policyState.AddEnviormentEvaluation(1, env, forcastReward); return(forcastReward); }
public override ActionPlan Execute(int time, Session session) { //1.1 处理reward processReward(time); //取得当前姿态和最优姿态 Vector curGesture = net.GetReceptorGesture(null); Vector optimaGesture = Session.handleGetOptimaGesture(net, time); policyState = new PolicyState(); policyState.curGesture = curGesture.clone(); policyState.optimaGesture = optimaGesture.clone(); //1.2 仍在随机动作阶段 if (time < Session.GetConfiguration().learning.development.step) { ActionPlan actionPlan = ActionPlan.CreateRandomPlan(net, time, "random walk"); policyState.policeText = actionPlan.judgeType; policyState.action = actionPlan.actions[0]; return(net.actionPlanChain.PutNext(actionPlan)); } //1.3 初始化值差异方向感 List <double> actions = null; Dictionary <Vector, Vector> actionToGesture = null; if (expectGesture == null) { expectGesture = optimaGesture.clone(); actions = net.doInference(time, expectGesture, out actionToGesture); policyState.objectiveGesture = expectGesture.clone(); policyState.action = actions[0]; policyState.policeText = "optima posture"; return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "optima posture"))); } //1.4计算当前环境状态下的评估 double envEvaluation = doEvaluateEnviornment(time, policyState); //1.5当前姿态是最优姿态,且当前环境状态为正评估,执行维持动作,结束 if (net.IsGestureInTolerateDistance(curGesture, optimaGesture) && envEvaluation >= 0) { optimaMaintainCount += 1; ActionPlan plan = ActionPlan.createMaintainPlan(net, time, "maintain optimal posture", envEvaluation, 0); policyState.action = plan.actions[0]; policyState.policeText = plan.judgeType; return(net.actionPlanChain.PutNext(plan)); } if (optimaMaintainCount > 0) { optimaMaintainCount = 0; if (envEvaluation >= 0) { actions = net.doInference(time, optimaGesture, out actionToGesture); policyState.action = actions[0]; policyState.policeText = "adjust optimal posture"; return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "adjust optimal posture"))); } } //1.6 计算偏离方向(如果与最优姿态出现偏离,计算偏离的方向) //对应距离来说,方向是指偏大偏小,对于角度来说,方向是指顺时针逆时针 if (expectDirection == null) { List <Receptor> gestureReceptors = net.GesturesReceptors; List <MeasureTools> measureTools = net.GestureMeasureTools; expectDirection = new Vector(true, curGesture.Size); for (int i = 0; i < curGesture.Size; i++) { if (double.IsNaN(optimaGesture[i])) { expectDirection[i] = 0; } else { expectDirection[i] = measureTools[i].getChangeDirection(curGesture[i], optimaGesture[i]); } } } //1.6 当前环境是正评估或未知评估 int K = 1; Vector objectiveGesture = null; int maintainSteps = net.actionPlanChain.Length; if (envEvaluation >= 0) { //1.6.1维持小于K步,执行维持动作 if (maintainSteps <= K) { ActionPlan plan = ActionPlan.createMaintainPlan(net, time, "positive evaluation and maintenance ", envEvaluation, 0); policyState.action = plan.actions[0]; policyState.policeText = plan.judgeType; return(net.actionPlanChain.PutNext(plan)); } //1.6.2 当前环境是正评估,目标姿态为当前姿态向期望方向靠近 else { Vector tempCurGesture = curGesture; while (true) { objectiveGesture = moveGesture(tempCurGesture, optimaGesture, expectDirection, 1); actions = net.doInference(time, objectiveGesture, out actionToGesture); actions = checkMaxActions(actions); actions = checkMove(actions, curGesture, objectiveGesture); policyState.setGestureAction(envEvaluation, curGesture, objectiveGesture, actions[0], actionToGesture); policyState.policeText = "Expectation improvement after positive evaluation"; return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "Expectation improvement after positive evaluation"))); } } } //1.7 当前奖励是负,切换期望姿态 objectiveGesture = moveGesture(curGesture, optimaGesture, expectDirection, -1); actions = net.doInference(time, objectiveGesture, out actionToGesture); actions = checkMaxActions(actions); if (actions[0] == 0.5) { if (expectDirection == 1) { actions[0] -= 0.1; } else if (expectDirection == -1) { actions[0] += 0.1; } } policyState.setGestureAction(envEvaluation, curGesture, objectiveGesture, actions[0], actionToGesture); policyState.setGestureAction(envEvaluation, curGesture, objectiveGesture, actions[0], actionToGesture); policyState.policeText = "lower expectations after negative evaluation"; //actions = checkMove(actions, curGesture, objectiveGesture); return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "lower expectations after negative evaluation"))); }