/// <summary>
/// Generate the full set of action plans to test: the plans found in action memory, plus newly created ones for the remaining candidates.
/// </summary>
/// <param name="plans">action plans found in action memory</param>
/// <param name="time"></param>
/// <returns></returns>
private List<ActionPlan> checkActionPlansFull(List<ActionPlan> plans, int time)
{
    if (plans == null)
    {
        plans = new List<ActionPlan>();
    }

    List<List<double>> actionSets = CreateTestActionSet(Session.instinctActionHandler(net, time));
    ActionPlan[] r = new ActionPlan[actionSets.Count];
    for (int i = 0; i < actionSets.Count; i++)
    {
        //reuse a remembered plan whose actions match this candidate action set
        ActionPlan plan = plans.FirstOrDefault(p => p.Equals(actionSets[i]));
        if (plan == null)
        {
            //index 0 is the instinct action; a first action component of 0.5 means "maintain"
            String judgeType = ActionPlan.JUDGE_INFERENCE;
            if (i == 0)
            {
                judgeType = ActionPlan.JUDGE_INSTINCT;
            }
            else if (actionSets[i][0] == 0.5)
            {
                judgeType = ActionPlan.JUDGE_MAINTAIN;
            }
            plan = ActionPlan.CreateActionPlan(net, actionSets[i], time, judgeType, "", 0);
        }
        r[i] = plan;
    }
    return r.ToList();
}
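
//The judge-type rule below is shared by the checkActionPlansFull overloads. This sketch
//isolates it for illustration only: it assumes, as the loops in those methods do, that
//index 0 of the test action set is the instinct action and that a first action component
//of 0.5 means "maintain the current action"; every other candidate is judged by inference.
private static String judgeTypeSketch(int index, double firstAction)
{
    if (index == 0)
    {
        return ActionPlan.JUDGE_INSTINCT;
    }
    if (firstAction == 0.5)
    {
        return ActionPlan.JUDGE_MAINTAIN;
    }
    return ActionPlan.JUDGE_INFERENCE;
}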
/// <summary>
/// Generate the full set of action plans to test: the action records found in observation memory, plus newly created plans for the remaining candidates.
/// </summary>
/// <param name="actionRecords">action records found in observation memory</param>
/// <param name="time"></param>
/// <returns></returns>
private List<ActionPlan> checkActionPlansFull(List<ObservationHistory.ActionRecord> actionRecords, int time)
{
    List<List<double>> actionSets = CreateTestActionSet(Session.instinctActionHandler(net, time));
    ActionPlan[] r = new ActionPlan[actionSets.Count];
    for (int i = 0; i < actionSets.Count; i++)
    {
        //index 0 is the instinct action; a first action component of 0.5 means "maintain"
        String judgeType = ActionPlan.JUDGE_INFERENCE;
        if (i == 0)
        {
            judgeType = ActionPlan.JUDGE_INSTINCT;
        }
        else if (actionSets[i][0] == 0.5)
        {
            judgeType = ActionPlan.JUDGE_MAINTAIN;
        }

        //reuse the remembered record whose first action component matches this candidate
        ObservationHistory.ActionRecord record = actionRecords.FirstOrDefault(p => p.actions[0] == actionSets[i][0]);
        ActionPlan plan;
        if (record == null)
        {
            plan = ActionPlan.CreateActionPlan(net, actionSets[i], time, judgeType, "");
        }
        else
        {
            plan = ActionPlan.CreateActionPlan(net, record.actions, time, judgeType, "");
            plan.evaulation = record.evaluation;
        }
        r[i] = plan;
    }
    return r.ToList();
}
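
//Illustrative sketch only, not part of the production path: the merge pattern used by the
//checkActionPlansFull overloads, reduced to plain BCL types. The hypothetical "candidates"
//parameter stands in for CreateTestActionSet's output and "remembered" for the records
//found in memory, keyed by the first action component; each candidate yields exactly one
//entry, reusing the remembered evaluation when one exists and leaving it NaN otherwise.
private static List<(double action, double evaluation)> mergeCandidatesSketch(
    List<double> candidates, Dictionary<double, double> remembered)
{
    var result = new List<(double action, double evaluation)>();
    foreach (double action in candidates)
    {
        double evaluation;
        if (!remembered.TryGetValue(action, out evaluation))
        {
            evaluation = double.NaN;    //unknown: to be evaluated later
        }
        result.Add((action, evaluation));
    }
    return result;
}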
/// <summary>
/// 1. Handle the reward
///    1.1 Store the current environment scene
/// 2. Generate the test action set
/// 3. Compute an evaluation value for each action
/// 4. Select the action with the maximum evaluation
/// </summary>
/// <param name="time"></param>
/// <param name="session"></param>
/// <returns></returns>
public override ActionPlan execute(int time, Session session)
{
    //1.1 handle the reward
    processReward(time);

    //1.2 still in the random action phase
    if (time < 1)
    {
        return net.actionPlanChain.PutNext(ActionPlan.CreateRandomPlan(net, time, "random walk"));
    }
    //the current plan is rewarded and still has remaining steps: keep maintaining it
    if (net.reward > 0 && net.actionPlanChain.Length > 0 && net.actionPlanChain.Root.planSteps > net.actionPlanChain.Length)
    {
        return net.actionPlanChain.PutNext(ActionPlan.createMaintainPlan(net, time, "maintain", 0, net.actionPlanChain.Last.planSteps - 1));
    }

    //1.3 create the test action set
    List<double> instinctAction = Session.instinctActionHandler(net, time);
    List<double> maintainAction = ActionPlan.MaintainAction;
    List<List<double>> testActionSet = this.CreateTestActionSet(instinctAction);
    //if a collision occurred, remove the maintain action from the candidates
    if (net.reward <= 0)
    {
        int mindex = testActionSet.FindIndex(a => a[0] == 0.5);
        if (mindex >= 0)
        {
            testActionSet.RemoveAt(mindex);
        }
    }

    //1.4 find the matching inference records of each test action and average their evaluations
    double[] evaluations = new double[testActionSet.Count];
    List<InferenceRecord>[] rs = new List<InferenceRecord>[testActionSet.Count];
    for (int i = 0; i < evaluations.Length; i++)
    {
        evaluations[i] = double.NaN;
    }
    List<Vector> envValues = net.GetReceptorSceneValues();
    for (int i = 0; i < testActionSet.Count; i++)
    {
        List<InferenceRecord> records = net.GetMatchInfRecords(envValues, testActionSet[i], time);
        rs[i] = new List<InferenceRecord>(records);
        List<InferenceRecord> evaluatedRecords = records.FindAll(r => !double.IsNaN(r.evulation));
        if (evaluatedRecords != null && evaluatedRecords.Count > 0)
        {
            evaluations[i] = evaluatedRecords.ConvertAll(r => r.evulation).Average();
        }
    }
    //penalize the maintain action after a strong negative reward, and penalize a third consecutive zero action
    for (int i = 0; i < testActionSet.Count; i++)
    {
        if (this.net.reward < -1.0 && testActionSet[i][0] == 0.5)
        {
            evaluations[i] = double.MinValue;
        }
        if (testActionSet[i][0] == 0.0 && net.actionPlanChain.Length >= 2 &&
            net.actionPlanChain.ToList()[net.actionPlanChain.Length - 1].actions[0] == 0 &&
            net.actionPlanChain.ToList()[net.actionPlanChain.Length - 2].actions[0] == 0)
        {
            evaluations[i] = double.MinValue;
        }
    }

    ActionPlan plan = null;
    //if any action is still unevaluated, try it first
    int index = -1;
    for (int i = 0; i < evaluations.Length; i++)
    {
        if (double.IsNaN(evaluations[i]))
        {
            index = i;
            break;
        }
    }
    if (index >= 0)
    {
        plan = net.actionPlanChain.PutNext(ActionPlan.CreateActionPlan(net, testActionSet[index], time, ActionPlan.JUDGE_INFERENCE, "unknown evaluation"));
        if (rs[index] != null && rs[index].Count > 0)
        {
            plan.inferenceRecords = rs[index].ConvertAll(r => (r, 0.0));
        }
        plan.evaulation = evaluations[index];
        plan.planSteps = 0;
        return plan;
    }

    //otherwise take the action with the maximum evaluation
    index = evaluations.ToList().IndexOf(evaluations.Max());
    plan = net.actionPlanChain.PutNext(ActionPlan.CreateActionPlan(net, testActionSet[index], time, ActionPlan.JUDGE_INFERENCE, "maximum evaluation"));
    if (rs[index] != null && rs[index].Count > 0)
    {
        plan.inferenceRecords = rs[index].ConvertAll(r => (r, 0.0));
    }
    plan.evaulation = evaluations[index];
    plan.planSteps = 0;
    return plan;
}
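
//Illustrative sketch only: the selection rule used by execute above, isolated from the
//network types. NaN marks an action that has no evaluated inference record yet; such an
//action is explored first, otherwise the action with the largest mean evaluation wins.
private static int selectActionIndexSketch(double[] evaluations)
{
    //prefer any action that has not been evaluated yet
    for (int i = 0; i < evaluations.Length; i++)
    {
        if (double.IsNaN(evaluations[i]))
        {
            return i;
        }
    }
    //otherwise take the argmax of the evaluations
    int best = 0;
    for (int i = 1; i < evaluations.Length; i++)
    {
        if (evaluations[i] > evaluations[best])
        {
            best = i;
        }
    }
    return best;
}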
public override ActionPlan Execute(int time, Session session)
{
    //1.1 handle the reward
    processReward(time);

    //get the current gesture and the optimal gesture
    Vector curGesture = net.GetReceptorGesture(null);
    Vector optimaGesture = Session.handleGetOptimaGesture(net, time);
    policyState = new PolicyState();
    policyState.curGesture = curGesture.clone();
    policyState.optimaGesture = optimaGesture.clone();

    //1.2 still in the random action (development) phase
    if (time < Session.GetConfiguration().learning.development.step)
    {
        ActionPlan actionPlan = ActionPlan.CreateRandomPlan(net, time, "random walk");
        policyState.policeText = actionPlan.judgeType;
        policyState.action = actionPlan.actions[0];
        return net.actionPlanChain.PutNext(actionPlan);
    }

    //1.3 initialize the expected gesture (the sense of the value-difference direction)
    List<double> actions = null;
    Dictionary<Vector, Vector> actionToGesture = null;
    if (expectGesture == null)
    {
        expectGesture = optimaGesture.clone();
        actions = net.doInference(time, expectGesture, out actionToGesture);
        policyState.objectiveGesture = expectGesture.clone();
        policyState.action = actions[0];
        policyState.policeText = "optima posture";
        return net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "optima posture"));
    }

    //1.4 compute the evaluation of the current environment state
    double envEvaluation = doEvaluateEnviornment(time, policyState);

    //1.5 the current gesture is the optimal gesture and the environment evaluation is non-negative: execute a maintain action and finish
    if (net.IsGestureInTolerateDistance(curGesture, optimaGesture) && envEvaluation >= 0)
    {
        optimaMaintainCount += 1;
        ActionPlan plan = ActionPlan.createMaintainPlan(net, time, "maintain optimal posture", envEvaluation, 0);
        policyState.action = plan.actions[0];
        policyState.policeText = plan.judgeType;
        return net.actionPlanChain.PutNext(plan);
    }
    if (optimaMaintainCount > 0)
    {
        optimaMaintainCount = 0;
        if (envEvaluation >= 0)
        {
            actions = net.doInference(time, optimaGesture, out actionToGesture);
            policyState.action = actions[0];
            policyState.policeText = "adjust optimal posture";
            return net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "adjust optimal posture"));
        }
    }

    //1.6 compute the deviation direction (if the gesture deviates from the optimal gesture, compute the direction of the deviation)
    //for a distance, direction means larger or smaller; for an angle, it means clockwise or counterclockwise
    if (expectDirection == null)
    {
        List<Receptor> gestureReceptors = net.GesturesReceptors;
        List<MeasureTools> measureTools = net.GestureMeasureTools;
        expectDirection = new Vector(true, curGesture.Size);
        for (int i = 0; i < curGesture.Size; i++)
        {
            if (double.IsNaN(optimaGesture[i]))
            {
                expectDirection[i] = 0;
            }
            else
            {
                expectDirection[i] = measureTools[i].getChangeDirection(curGesture[i], optimaGesture[i]);
            }
        }
    }

    //1.6 the current environment evaluation is positive or unknown
    int K = 1;
    Vector objectiveGesture = null;
    int maintainSteps = net.actionPlanChain.Length;
    if (envEvaluation >= 0)
    {
        //1.6.1 maintained for at most K steps: execute a maintain action
        if (maintainSteps <= K)
        {
            ActionPlan plan = ActionPlan.createMaintainPlan(net, time, "positive evaluation and maintenance", envEvaluation, 0);
            policyState.action = plan.actions[0];
            policyState.policeText = plan.judgeType;
            return net.actionPlanChain.PutNext(plan);
        }
        //1.6.2 positive evaluation: the objective gesture moves from the current gesture toward the expected direction
        else
        {
            objectiveGesture = moveGesture(curGesture, optimaGesture, expectDirection, 1);
            actions = net.doInference(time, objectiveGesture, out actionToGesture);
            actions = checkMaxActions(actions);
            actions = checkMove(actions, curGesture, objectiveGesture);
            policyState.setGestureAction(envEvaluation, curGesture, objectiveGesture, actions[0], actionToGesture);
            policyState.policeText = "Expectation improvement after positive evaluation";
            return net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE,
                "Expectation improvement after positive evaluation"));
        }
    }

    //1.7 the current reward is negative: switch the expected gesture (lower expectations)
    objectiveGesture = moveGesture(curGesture, optimaGesture, expectDirection, -1);
    actions = net.doInference(time, objectiveGesture, out actionToGesture);
    actions = checkMaxActions(actions);
    if (actions[0] == 0.5)
    {
        //nudge the maintain action away from 0.5 along the expected direction
        //(assumption: the first component of expectDirection carries the direction sign)
        if (expectDirection[0] == 1)
        {
            actions[0] -= 0.1;
        }
        else if (expectDirection[0] == -1)
        {
            actions[0] += 0.1;
        }
    }
    policyState.setGestureAction(envEvaluation, curGesture, objectiveGesture, actions[0], actionToGesture);
    policyState.policeText = "lower expectations after negative evaluation";
    //actions = checkMove(actions, curGesture, objectiveGesture);
    return net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "lower expectations after negative evaluation"));
}