Example #1
    private void FixedUpdate()
    {
        timer += Time.deltaTime;
        List<double> states = new List<double>();
        List<double> qs     = new List<double>();

        states.Add(this.transform.rotation.x);
        states.Add(this.transform.rotation.z);
        //use the ball's position here so it matches the state stored in the replay memory below
        states.Add(ball.transform.position.z);
        states.Add(ball.GetComponent<Rigidbody>().angularVelocity.x);
        states.Add(ball.GetComponent<Rigidbody>().angularVelocity.z);

        qs = ANN.SoftMax(ann.CalculateOutput(states));
        double maxQ      = qs.Max();
        int    maxQIndex = qs.IndexOf(maxQ);

        exploreRate = Mathf.Clamp(exploreRate - exploreDecay, minExploreRate, maxExploreRate);

        //check to see if we choose a random action
        if (UnityEngine.Random.Range(1, 100) < exploreRate)
        {
            maxQIndex = UnityEngine.Random.Range(0, 4);
        }

        //action 0 tilt right
        //action 1 tilt left
        //action 2 tilt forward
        //action 3 tilt backward
        //maxQIndex == 0 means action 0
        if (maxQIndex == 0)
        {
            this.transform.Rotate(Vector3.right, tiltSpeed * (float)qs[maxQIndex]);
        }
        else if (maxQIndex == 1)
        {
            this.transform.Rotate(Vector3.right, -tiltSpeed * (float)qs[maxQIndex]);
        }
        else if (maxQIndex == 2)
        {
            this.transform.Rotate(Vector3.forward, tiltSpeed * (float)qs[maxQIndex]);
        }
        else if (maxQIndex == 3)
        {
            this.transform.Rotate(Vector3.forward, -tiltSpeed * (float)qs[maxQIndex]);
        }

        if (ball.GetComponent<BallState>().dropped)
        {
            reward = -1f;
        }
        else
        {
            reward = 0.1f;
        }

        Replay lastMemory = new Replay(this.transform.rotation.x,
                                       this.transform.rotation.z,
                                       ball.transform.position.z,
                                       ball.GetComponent<Rigidbody>().angularVelocity.x,
                                       ball.GetComponent<Rigidbody>().angularVelocity.z,
                                       reward);

        if (replayMemory.Count > mCapacity)
        {
            replayMemory.RemoveAt(0);
        }

        replayMemory.Add(lastMemory);

        //Q-learning starts here
        //up to this point all we did was gather the inputs, get the result
        //from the ANN, assign the reward accordingly, and store the memory
        if (ball.GetComponent <BallState>().dropped)
        {
            //looping backwards so the quality of the last memory gets carried
            //back up through the list, letting us attribute its blame to the
            //earlier memories
            for (int i = replayMemory.Count - 1; i >= 0; --i)
            {
                //for each memory we run the ANN:
                //first we find the Q values of the current memory
                List<double> currentMemoryQValues = ANN.SoftMax(ann.CalculateOutput(replayMemory[i].states));
                //the Q values of the next memory are filled in later, when needed
                List<double> nextMemoryQValues = new List<double>();

                //find the maximum Q value of the current memory
                double maxQOld = currentMemoryQValues.Max();
                //and which action gave that Q value
                int action = currentMemoryQValues.IndexOf(maxQOld);

                double feedback;
                //check whether the current memory is the last memory,
                //or whether its reward is -1: a reward of -1 means the ball
                //was dropped, so every memory after this one is meaningless,
                //because this is the end of the memory sequence
                if ((i == replayMemory.Count - 1) || (replayMemory[i].reward == -1f))
                {
                    feedback = replayMemory[i].reward;
                }
                else
                {
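                    //Bellman update: the feedback is this step's reward plus
                    //the discounted best Q value of the next state,
                    //feedback = r + discount * max Q(s')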
                    nextMemoryQValues = ANN.SoftMax(ann.CalculateOutput(replayMemory[i + 1].states));
                    maxQ     = nextMemoryQValues.Max();
                    feedback = (replayMemory[i].reward + discount * maxQ);
                }

                //adding the correct reward (Q value) to the current action
                currentMemoryQValues[action] = feedback;
                //using the feedback to train the ANN
                ann.Train(replayMemory[i].states, currentMemoryQValues);
            }

            if (timer > maxBalanceTime)
            {
                maxBalanceTime = timer;
            }

            timer = 0;

            ball.GetComponent <BallState>().dropped = false;
            this.transform.rotation = Quaternion.identity;
            ResetBall();
            replayMemory.Clear();
            failCount++;
        }
    }
Example #2

    private void FixedUpdate() {
        timer += Time.deltaTime;
        List<double> states = new List<double>();
        List<double> qs = new List<double>();

        states.Add(this.transform.rotation.x);
        states.Add(ball.transform.position.z);
        states.Add(ball.GetComponent<Rigidbody>().angularVelocity.x);

        qs = ann.SoftMax(ann.CalcOutput(states));
        double maxQ = qs.Max();
        Debug.Log("quality: " + maxQ);

        int maxQIndex = qs.IndexOf(maxQ);
        exploreRate = Mathf.Clamp(exploreRate - exploreDecay, minExploreRate, maxExploreRate);

        //if (Random.Range(0, 100) < exploreRate)
        //    maxQIndex = Random.Range(0, 2);

        if (maxQIndex == 0)
            this.transform.Rotate(Vector3.right, tiltSpeed * (float)qs[maxQIndex]);
        else if (maxQIndex == 1)
            this.transform.Rotate(Vector3.right, -tiltSpeed * (float)qs[maxQIndex]);

        if (ball.GetComponent<BallState>().dropped)
            reward = -1f;
        else
            reward = 0.1f;

        Replay lastMemory = new Replay(
                this.transform.rotation.x,
                ball.transform.position.z,
                ball.GetComponent<Rigidbody>().angularVelocity.x,
                reward
            );

        if (replayMemory.Count > memCap)
            replayMemory.RemoveAt(0);

        replayMemory.Add(lastMemory);

        if (ball.GetComponent<BallState>().dropped) {
            for (int i = replayMemory.Count - 1; i >= 0; i--) {
                List<double> outputsNew = new List<double>();
                List<double> outputsOld = ann.SoftMax(ann.CalcOutput(replayMemory[i].states));

                double maxQOld = outputsOld.Max();
                int action = outputsOld.IndexOf(maxQOld);
                double feedback;

                if (i == replayMemory.Count - 1 || replayMemory[i].reward == -1) {
                    feedback = replayMemory[i].reward;
                } else {
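                    //Bellman update: feedback = r + discount * max Q(s')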
                    outputsNew = ann.SoftMax(ann.CalcOutput(replayMemory[i + 1].states));
                    maxQ = outputsNew.Max();
                    feedback = replayMemory[i].reward + discount * maxQ;
                }

                outputsOld[action] = feedback;
                ann.Train(replayMemory[i].states, outputsOld);
            }
            ResetBall();
            failCount++;
        }

    }
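
Both examples build Replay objects and read back their states and reward members, but the Replay class itself is not shown. Below is a minimal sketch of a container that would satisfy both call sites (six constructor arguments in Example #1, four in Example #2); only the states list, the reward field, and the "states first, reward last" argument order are implied by the code above, the params-based constructor is an assumption:

    using System.Collections.Generic;

    public class Replay
    {
        public List<double> states;
        public double reward;

        //the last argument is the reward; everything before it is state,
        //so the same class works for both the 5-state and the 3-state setup
        public Replay(params double[] values)
        {
            states = new List<double>();
            for (int i = 0; i < values.Length - 1; i++)
            {
                states.Add(values[i]);
            }
            reward = values[values.Length - 1];
        }
    }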