public StateRewardPair getStateRewardPair(StateActionPair saPair)
 {
     if (!_modelTable.ContainsKey(saPair))
     {
         // If the StateRewardPair is not yet in the table, it has to be saved.
         // Therefore the possible next state has to be saved in a new StateRewardPair.
         // To do so, all possible Movements for the next state have to be extracted
         // -> new method in StateExtractor that retrieves the possible Movements
         //    for any State or Position, independently of the mower!
         _modelTable[saPair] = new StateRewardPair(saPair.state, _initVal);
     }
     return _modelTable[saPair];
 }
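 // A minimal sketch (hypothetical method name, assuming _modelTable is a
 // Dictionary<StateActionPair, StateRewardPair>) of the same lazy-initialization
 // pattern via TryGetValue, which avoids the double lookup: the first request for an
 // unseen StateActionPair stores a default StateRewardPair seeded with _initVal,
 // later requests return the stored (and possibly updated) entry.
 public StateRewardPair getStateRewardPairViaTryGet(StateActionPair saPair)
 {
     StateRewardPair srp;
     if (!_modelTable.TryGetValue(saPair, out srp))
     {
         // Unseen pair: create a default entry with the configured initial value.
         srp = new StateRewardPair(saPair.state, _initVal);
         _modelTable[saPair] = srp;
     }
     return srp;
 }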
    /// <summary>
    /// Learn using the observed reward.
    /// </summary>
    /// <param name="reward">The observed reward.</param>
    public override void Learn(float reward)
    {
        if (_showParamLabel)
        {
            _receivers[0].InitialValues(Greediness, DiscountValue, LearnRate, InitialQValue,
                                        Run_with_etraces, Gamma, ModelPlanning, Refined, N);
        }
        // Old state
        StateActionPair oldSAP    = new StateActionPair(_lastState, _lastAction);
        float           oldQvalue = _qTable.getQValue(oldSAP);

        // Current state
        State           currentState      = base.CurrentState;
        MovementAction  bestCurrentAction = _qTable.getBestActionForState(currentState);
        StateActionPair currentSAP        = new StateActionPair(currentState, bestCurrentAction);
        float           bestCurrentQValue = _qTable.getQValue(currentSAP);

        if (Run_with_etraces)
        {
            float delta = reward + DiscountValue * bestCurrentQValue - oldQvalue;
            _eTable.SetEligibilityValue(oldSAP, 1f);
            _qTable.AddScaledValues(LearnRate * delta, _eTable);
            _eTable.ScaleAllEligibilityValues(DiscountValue * Gamma);
        }
        else
        {
            // Standard QLearning
            float newQValue = Mathf.Lerp(oldQvalue, reward + (DiscountValue * bestCurrentQValue), LearnRate);
            _qTable.setQValue(oldSAP, newQValue);
        }
        // Refined Model
        if (ModelPlanning && Refined)
        {
            // Update the model according to the observed state
            _mTable.IncorporateObservedState(currentState);

            State          virtualFromState, virtualToState;
            MovementAction virtualPerformedAction;
            float          virtualReward;
            // Perform N virtual steps
            for (int i = 0; i < N; i++)
            {
                // Generate a virtual step from the model
                bool virtualStepGenerated = _mTable.GenerateRandomModelStep(out virtualFromState, out virtualPerformedAction, out virtualToState, out virtualReward);
                if (virtualStepGenerated)
                {
                    StateActionPair virtualFromSAP = new StateActionPair(virtualFromState, virtualPerformedAction);
                    // Standard QLearning
                    float fromStateQVal = _qTable.getQValue(virtualFromSAP);
                    // Get the best action after the virtual step
                    MovementAction  bestAction   = _qTable.getBestActionForState(virtualToState);
                    StateActionPair virtualToSAP = new StateActionPair(virtualToState, bestAction);
                    // Q value update for the virtual step
                    float toStateQVal = _qTable.getQValue(virtualToSAP);
                    float newQVal     = Mathf.Lerp(fromStateQVal, virtualReward + (DiscountValue * toStateQVal), LearnRate);
                    _qTable.setQValue(virtualFromSAP, newQVal);
                }
            }
        }
        // DynaQ Model
        if (ModelPlanning && !Refined)
        {
            _simpleMTable.setStateRewardPairAtStateActionPair(oldSAP, new StateRewardPair(currentState, reward));
            for (int i = 0; i < N; i++)
            {
                StateActionPair randSAP = _qTable.getRandomVisitedStateAndAction();
                StateRewardPair srp     = _simpleMTable.getStateRewardPair(randSAP);
                // Standard QLearning
                float qVal = _qTable.getQValue(randSAP);

                // Best action in the successor state predicted by the model
                MovementAction bAct = _qTable.getBestActionForState(srp.State);

                // Q value target built from the model's predicted successor state
                StateActionPair cSAP    = new StateActionPair(srp.State, bAct);
                float           bQVal   = _qTable.getQValue(cSAP);
                float           newQVal = Mathf.Lerp(qVal, srp.Reward + (DiscountValue * bQVal), LearnRate);
                _qTable.setQValue(randSAP, newQVal);
            }
        }
    }
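    // The "Standard QLearning" branches above rely on Mathf.Lerp for the textbook
    // update Q(s,a) <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a)),
    // since Lerp(a, b, t) = a + t * (b - a); the eligibility-trace branch applies the
    // same TD error delta to every traced pair instead. A minimal sketch of the
    // one-step update (hypothetical helper name, UnityEngine.Mathf assumed as above):
    private static float OneStepQUpdateSketch(float oldQ, float reward, float bestNextQ,
                                              float learnRate, float discount)
    {
        float target = reward + discount * bestNextQ;            // r + gamma * max_a' Q(s',a')
        return UnityEngine.Mathf.Lerp(oldQ, target, learnRate);  // oldQ + alpha * (target - oldQ)
    }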
    /// <summary>
    /// Inserts the next state plus the observed reward
    /// at the given StateActionPair position.
    /// </summary>
    /// <param name="saPair">The state action pair used as the key.</param>
    /// <param name="stateReward">The following state together with the observed reward.</param>
    public void setStateRewardPairAtStateActionPair(StateActionPair saPair, StateRewardPair stateReward)
    {
        _modelTable[saPair] = stateReward;
    }
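    // A minimal usage sketch of the simple DynaQ model (hypothetical method and
    // variable names; StateRewardPair is assumed to expose State and Reward as used
    // in Learn() above): after every real step the observed transition (s, a) -> (s', r)
    // is recorded, and the planning loop later replays stored transitions as virtual
    // experience.
    public void RecordObservedTransitionSketch(StateActionPair lastSAP, State observedState, float observedReward)
    {
        // Store the observed successor state and reward under the executed pair.
        setStateRewardPairAtStateActionPair(lastSAP, new StateRewardPair(observedState, observedReward));

        // Reading it back yields the stored successor state and reward,
        // exactly what the DynaQ planning loop samples via getStateRewardPair.
        StateRewardPair stored = getStateRewardPair(lastSAP);
    }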
 /// <summary>
 /// Initializes a new instance of the <see cref="ModelTableSimple.ModelTableSimpleKeyValuePair"/> class.
 /// </summary>
 /// <param name="stateActionPairKey">The state action pair key.</param>
 /// <param name="stateRewardPairValue">The state reward pair value.</param>
 public ModelTableSimpleKeyValuePair(StateActionPair stateActionPairKey, StateRewardPair stateRewardPairValue)
 {
     StateActionPairKey   = stateActionPairKey;
     StateRewardPairValue = stateRewardPairValue;
 }