Example #1
        /// <summary>
        /// Evaluate the environment.
        /// The agent has a preference (like or dislike) toward the environment; this preference is unrelated to the
        /// target task and depends only on the rewards or punishments it receives in the environment.
        /// </summary>
        /// <param name="time">current time step</param>
        /// <param name="policyState">policy state that the evaluation is recorded into</param>
        /// <returns>the stored or forecast evaluation of the current environment</returns>
        private double doEvaluateEnviornment(int time, PolicyState policyState)
        {
            var    obs = net.GetReceoptorSplit();
            Vector env = obs.env;


            //Look up the current environment in the memory store
            SceneItem sceneItem = this.scene.GetMatched(env);

            if (sceneItem != null)
            {
                policyState.AddEnviormentEvaluation(0, env, sceneItem.evaluation);
                return(sceneItem.evaluation);
            }

            //No matching scene found: forecast the reward for the current gesture and environment
            double forcastReward = net.DoForcastReward(time, obs.gesture, env, 3, 0);

            policyState.AddEnviormentEvaluation(1, env, forcastReward);
            return(forcastReward);
        }
Example #2
        public override ActionPlan Execute(int time, Session session)
        {
            //1.1 Process the reward
            processReward(time);

            //Get the current gesture and the optimal gesture
            Vector curGesture    = net.GetReceptorGesture(null);
            Vector optimaGesture = Session.handleGetOptimaGesture(net, time);

            policyState               = new PolicyState();
            policyState.curGesture    = curGesture.clone();
            policyState.optimaGesture = optimaGesture.clone();

            //1.2 Still in the random-action phase
            if (time < Session.GetConfiguration().learning.development.step)
            {
                ActionPlan actionPlan = ActionPlan.CreateRandomPlan(net, time, "random walk");
                policyState.policeText = actionPlan.judgeType;
                policyState.action     = actionPlan.actions[0];
                return(net.actionPlanChain.PutNext(actionPlan));
            }

            //1.3 Initialize the sense of direction for value differences
            List <double> actions = null;
            Dictionary <Vector, Vector> actionToGesture = null;

            //The expected gesture has not been set yet: adopt the optimal gesture and plan toward it
            if (expectGesture == null)
            {
                expectGesture = optimaGesture.clone();
                actions       = net.doInference(time, expectGesture, out actionToGesture);
                policyState.objectiveGesture = expectGesture.clone();
                policyState.action           = actions[0];
                policyState.policeText       = "optima posture";
                return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "optima posture")));
            }

            //1.4 Compute the evaluation of the current environment state
            double envEvaluation = doEvaluateEnviornment(time, policyState);

            //1.5 The current gesture is the optimal gesture and the environment evaluation is positive: execute a maintain action and finish
            if (net.IsGestureInTolerateDistance(curGesture, optimaGesture) && envEvaluation >= 0)
            {
                optimaMaintainCount += 1;
                ActionPlan plan = ActionPlan.createMaintainPlan(net, time, "maintain optimal posture", envEvaluation, 0);
                policyState.action     = plan.actions[0];
                policyState.policeText = plan.judgeType;
                return(net.actionPlanChain.PutNext(plan));
            }
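            //We were maintaining the optimal gesture but no longer are: reset the counter and,
            //if the evaluation is still non-negative, re-plan toward the optimal gesture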
            if (optimaMaintainCount > 0)
            {
                optimaMaintainCount = 0;
                if (envEvaluation >= 0)
                {
                    actions                = net.doInference(time, optimaGesture, out actionToGesture);
                    policyState.action     = actions[0];
                    policyState.policeText = "adjust optimal posture";
                    return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "adjust optimal posture")));
                }
            }

            //1.6 Compute the deviation direction (if the gesture deviates from the optimal gesture, compute the direction of the deviation)
            //For a distance, the direction means larger or smaller; for an angle, it means clockwise or counterclockwise
            if (expectDirection == null)
            {
                List <Receptor>     gestureReceptors = net.GesturesReceptors;
                List <MeasureTools> measureTools     = net.GestureMeasureTools;
                expectDirection = new Vector(true, curGesture.Size);
                for (int i = 0; i < curGesture.Size; i++)
                {
                    if (double.IsNaN(optimaGesture[i]))
                    {
                        expectDirection[i] = 0;
                    }
                    else
                    {
                        expectDirection[i] = measureTools[i].getChangeDirection(curGesture[i], optimaGesture[i]);
                    }
                }
            }

            //1.7 The current environment evaluation is positive or unknown
            int    K = 1;                                          //threshold of consecutive maintain steps
            Vector objectiveGesture = null;
            int    maintainSteps    = net.actionPlanChain.Length;  //how many steps the current plan has been maintained

            if (envEvaluation >= 0)
            {
                //1.7.1 If we have maintained for at most K steps, execute a maintain action
                if (maintainSteps <= K)
                {
                    ActionPlan plan = ActionPlan.createMaintainPlan(net, time, "positive evaluation and maintenance ", envEvaluation, 0);
                    policyState.action     = plan.actions[0];
                    policyState.policeText = plan.judgeType;
                    return(net.actionPlanChain.PutNext(plan));
                }
                //1.7.2 The environment evaluation is positive: the objective gesture moves the current gesture toward the expected direction
                else
                {
                    Vector tempCurGesture = curGesture;
                    while (true)
                    {
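                        //Note: the body below always returns, so this loop runs at most once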
                        objectiveGesture = moveGesture(tempCurGesture, optimaGesture, expectDirection, 1);
                        actions          = net.doInference(time, objectiveGesture, out actionToGesture);
                        actions          = checkMaxActions(actions);
                        actions          = checkMove(actions, curGesture, objectiveGesture);
                        policyState.setGestureAction(envEvaluation, curGesture, objectiveGesture, actions[0], actionToGesture);
                        policyState.policeText = "Expectation improvement after positive evaluation";
                        return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "Expectation improvement after positive evaluation")));
                    }
                }
            }
            //1.8 The current reward is negative: switch the expected gesture (lower expectations)
            objectiveGesture = moveGesture(curGesture, optimaGesture, expectDirection, -1);
            actions          = net.doInference(time, objectiveGesture, out actionToGesture);
            actions          = checkMaxActions(actions);
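            //If the inferred action is exactly the neutral value 0.5, nudge it slightly according to the expected change direction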
            if (actions[0] == 0.5)
            {
                if (expectDirection == 1)
                {
                    actions[0] -= 0.1;
                }
                else if (expectDirection == -1)
                {
                    actions[0] += 0.1;
                }
            }

            policyState.setGestureAction(envEvaluation, curGesture, objectiveGesture, actions[0], actionToGesture);
            policyState.policeText = "lower expectations after negative evaluation";

            //actions = checkMove(actions, curGesture, objectiveGesture);
            return(net.actionPlanChain.Reset(ActionPlan.CreateActionPlan(net, actions, time, ActionPlan.JUDGE_INFERENCE, "lower expectations after negative evaluation")));
        }