/// <summary>
/// Computes the average evaluation density for each candidate action plan by running
/// forward inference for a fixed number of steps and averaging the density of each matched scene.
/// </summary>
/// <param name="plans">Candidate action plans</param>
/// <returns>One density value per plan; -init_plan_depth when no scene could be matched</returns>
private double[] computeEvaluationDensity(List<ActionPlan> plans)
{
    double[] density = new double[plans.Count];
    int inferenceCount = Session.GetConfiguration().evaluation.policy.init_plan_depth;

    for (int i = 0; i < plans.Count; i++)
    {
        List<double> values = new List<double>();
        // Merge the plan's observation and actions into the receptor values used as inference input
        List<Vector> obs = net.GetMergeReceptorValues(plans[i].observation, plans[i].actions);

        for (int j = 0; j < inferenceCount; j++)
        {
            // Step the inference forward; stop if no prediction can be made
            obs = net.forward_inference(obs);
            if (obs == null) break;

            // Look up the scene in action memory that matches the predicted observation
            ObservationHistory.Scene scene = net.actionMemory.FindMatchedScene(obs).Item1;
            if (scene == null) continue;
            if (double.IsNaN(scene.density)) continue;

            values.Add(scene.density);
        }

        // No matched scene at all: mark the plan with the sentinel value -inferenceCount
        density[i] = values.Count <= 0 ? -1 * inferenceCount : values.Average();
    }
    return density;
}
/// <summary>
/// Create a new action plan.
/// </summary>
/// <param name="time"></param>
/// <param name="session"></param>
/// <returns></returns>
private ActionPlan makeNewActionPlan(int time, Session session)
{
    // Get all action records matching the current scene
    var s = net.actionMemory.FindMatchedScene();
    ObservationHistory.Scene scene = s.Item1;
    List<ObservationHistory.ActionRecord> actionRecords = scene == null
        ? new List<ObservationHistory.ActionRecord>()
        : scene.records;

    // If the recorded action plans are not complete, fill in all possible plans,
    // ordered consistently with the instinct action
    List<ActionPlan> plans = checkActionPlansFull(actionRecords, time);

    // Find the instinct action plan and the maintain (keep-current-action) plan
    List<double> instinctAction = Session.instinctActionHandler(net, time);
    ActionPlan instinctPlan = plans.FirstOrDefault(p => p.actions[0] == instinctAction[0]);
    ActionPlan maintainPlan = plans.FirstOrDefault(p => p.actions[0] == 0.5);

    // Iterate over all plans
    double[] forcast = new double[plans.Count];
    for (int i = 0; i < plans.Count; i++)
    {
        // The last reward was negative, so maintaining the current action is pointless
        if (net.reward < 0 && plans[i].IsMaintainAction()) continue;

        // If the i-th action has a definitely positive evaluation, pick it
        if (plans[i].evaulation > 0)
        {
            plans[i].reason = "Heading toward positive evaluation";
            plans[i].planSteps = (int)plans[i].evaulation + policyConfig.init_plan_depth;
            //forcast[i] = forcastActionPlan(scene != null ? scene.scene : plans[i].inputObs, plans[i].actions);
            //if (forcast[i] < 0) continue;
            //plans[i].expect = forcast[i];
            return plans[i];
        }
        // If the i-th action has an unknown evaluation, pick it
        else if (double.IsNaN(plans[i].evaulation))
        {
            plans[i].reason = "Exploring the unknown";
            plans[i].planSteps = policyConfig.init_plan_depth;
            //forcast[i] = forcastActionPlan(scene != null ? scene.scene : plans[i].inputObs, plans[i].actions);
            //if (forcast[i] < 0) continue;
            //plans[i].expect = forcast[i];
            return plans[i];
        }
        else
        {
            forcast[i] = plans[i].evaulation;
        }
    }

    // Reaching this point means every evaluation is negative.
    // This case is awkward to handle: the reward is unrelated to the goal position, so depending on
    // the maze it is sometimes right to move in the instinct direction and sometimes not.
    // 1. Either change the reward computation so that, besides keeping the negative collision reward,
    //    moving along the route earns a larger reward; then this case becomes simple: just pick the
    //    action with the highest reward among the remembered scenes.
    // 2. The cumbersome option is to test by inference: for each direction, check whether every
    //    subsequent step near the instinct direction is unknown or keeps a large collision distance;
    //    but that is computationally too expensive.
    // 3. Compute the evaluation (collision) density of each action's remembered scenes and, by
    //    priority, choose among the three with the smallest density.
    /*double[] evaluationDensity = computeEvaluationDensity(plans);
     * for (int i = 0; i < plans.Count; i++)
     * {
     *     // A negative reward means a collision occurred, so the maintain action is not an option
     *     if (net.reward < 0 && plans[i].actions[0] == 0.5) continue;
     *     // After a collision, actions close to the maintain action are not an option either
     *     if (net.reward < 0 && Math.Abs(plans[i].actions[0] - 0.5) < 0.25)
     *         continue;
     *     // Compute the rank of this plan's evaluation density
     *     int c = evaluationDensity.ToList().Count(d => d <= evaluationDensity[i]);
     *     if (c <= 3)
     *     {
     *         plans[i].reason = "Minimum evaluation density (" + evaluationDensity[i].ToString("F3") + ")";
     *         plans[i].planSteps = -1 * (int)plans[i].evaulation / 2;
     *         return plans[i];
     *     }
     * }*/

    // The following should not normally be reached
    List<ActionPlan> ps = plans;
    if (net.reward < 0)
    {
        ps = plans.FindAll(p => Math.Abs(p.actions[0] - 0.5) >= 0.25);
    }
    int t = Network.rng.Next(0, ps.Count);
    ps[t].reason = "All evaluations negative, choosing randomly";
    ps[t].planSteps = double.IsNaN(ps[t].evaulation) ? 16 : -1 * (int)ps[t].evaulation / 2;
    return ps[t];
}
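
// Sketch (not wired in): a standalone helper packaging the disabled "option 3" fallback above,
// i.e. choosing among the plans with the smallest evaluation density once every evaluation is
// negative. The name selectPlanByEvaluationDensity is hypothetical; the body only reuses members
// that already appear in this class (computeEvaluationDensity, net.reward, the ActionPlan fields)
// and mirrors the commented-out block in makeNewActionPlan, so treat it as an illustration of that
// strategy rather than part of the implemented policy.
private ActionPlan selectPlanByEvaluationDensity(List<ActionPlan> plans)
{
    double[] evaluationDensity = computeEvaluationDensity(plans);
    for (int i = 0; i < plans.Count; i++)
    {
        // After a collision (negative reward), skip the maintain action and anything close to it
        if (net.reward < 0 && Math.Abs(plans[i].actions[0] - 0.5) < 0.25) continue;
        // Rank of this plan's density among all plans (1 = smallest)
        int rank = evaluationDensity.Count(d => d <= evaluationDensity[i]);
        if (rank <= 3)
        {
            plans[i].reason = "Minimum evaluation density (" + evaluationDensity[i].ToString("F3") + ")";
            plans[i].planSteps = -1 * (int)plans[i].evaulation / 2;
            return plans[i];
        }
    }
    // No candidate passed the filters; the caller would fall back to random selection
    return null;
}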