private void FixedUpdate()
{
    timer += Time.deltaTime;
    List<double> states = new List<double>();
    List<double> qs = new List<double>();

    //state inputs: platform rotation, ball position, and ball angular velocity
    states.Add(this.transform.rotation.x);
    states.Add(this.transform.rotation.z);
    states.Add(ball.transform.position.z);
    states.Add(ball.GetComponent<Rigidbody>().angularVelocity.x);
    states.Add(ball.GetComponent<Rigidbody>().angularVelocity.z);

    qs = ANN.SoftMax(ann.CalculateOutput(states));
    double maxQ = qs.Max();
    int maxQIndex = qs.ToList().IndexOf(maxQ);
    exploreRate = Mathf.Clamp(exploreRate - exploreDecay, minExploreRate, maxExploreRate);

    //check to see if we should choose a random action instead
    if (UnityEngine.Random.Range(1, 100) < exploreRate)
    {
        maxQIndex = UnityEngine.Random.Range(0, 4);
    }

    //action 0: tilt right
    //action 1: tilt left
    //action 2: tilt forward
    //action 3: tilt backward
    //maxQIndex == 0 means action 0, and so on
    if (maxQIndex == 0)
    {
        this.transform.Rotate(Vector3.right, tiltSpeed * (float)qs[maxQIndex]);
    }
    else if (maxQIndex == 1)
    {
        this.transform.Rotate(Vector3.right, -tiltSpeed * (float)qs[maxQIndex]);
    }
    else if (maxQIndex == 2)
    {
        this.transform.Rotate(Vector3.forward, tiltSpeed * (float)qs[maxQIndex]);
    }
    else if (maxQIndex == 3)
    {
        this.transform.Rotate(Vector3.forward, -tiltSpeed * (float)qs[maxQIndex]);
    }

    //small positive reward while the ball stays up, -1 when it drops
    if (ball.GetComponent<BallState>().dropped)
    {
        reward = -1f;
    }
    else
    {
        reward = 0.1f;
    }

    Replay lastMemory = new Replay(this.transform.rotation.x,
                                   this.transform.rotation.z,
                                   ball.transform.position.z,
                                   ball.GetComponent<Rigidbody>().angularVelocity.x,
                                   ball.GetComponent<Rigidbody>().angularVelocity.z,
                                   reward);

    if (replayMemory.Count > mCapacity)
    {
        replayMemory.RemoveAt(0);
    }

    replayMemory.Add(lastMemory);

    //Q-learning starts here.
    //Up to this point all we did was gather the inputs, get the result from the ANN,
    //assign the reward accordingly, and store the experience.
    if (ball.GetComponent<BallState>().dropped)
    {
        //Loop backwards so the quality of the last memory gets carried back
        //up through the list and its blame is attributed to the earlier memories.
        for (int i = replayMemory.Count - 1; i >= 0; --i)
        {
            //For each memory we run the ANN:
            //first we find the Q values of the current memory,
            List<double> currentMemoryQValues = new List<double>();
            //then we take the Q values of the next memory.
            List<double> nextMemoryQValues = new List<double>();

            currentMemoryQValues = ANN.SoftMax(ann.CalculateOutput(replayMemory[i].states));

            //find the maximum Q value of the current memory
            double maxQOld = currentMemoryQValues.Max();
            //and which action gave that Q value
            int action = currentMemoryQValues.ToList().IndexOf(maxQOld);

            double feedback;
            //Check whether the current memory is the last memory, or whether its
            //reward is -1. A reward of -1 means the ball was dropped, so every
            //memory after this one is meaningless: it is the end of the memory sequence.
            if ((i == replayMemory.Count - 1) || (replayMemory[i].reward == -1f))
            {
                feedback = replayMemory[i].reward;
            }
            else
            {
                nextMemoryQValues = ANN.SoftMax(ann.CalculateOutput(replayMemory[i + 1].states));
                maxQ = nextMemoryQValues.Max();
                feedback = replayMemory[i].reward + discount * maxQ;
            }

            //assign the corrected reward (Q value) to the chosen action
            currentMemoryQValues[action] = feedback;
            //and use that feedback to train the ANN
            ann.Train(replayMemory[i].states, currentMemoryQValues);
        }

        if (timer > maxBalanceTime)
        {
            maxBalanceTime = timer;
        }

        timer = 0;
        ball.GetComponent<BallState>().dropped = false;
        this.transform.rotation = Quaternion.identity;
        ResetBall();
        replayMemory.Clear();
        failCount++;
    }
}
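The listing leans on a small Replay container that is not shown here: it packs the state values that were fed to the network into a list, together with the reward received at that step, so the training loop can re-run the ANN on replayMemory[i].states and read replayMemory[i].reward. A minimal sketch of what that class might look like follows; the constructor parameter names are illustrative, not taken from the source, and the stripped-down variant further below would need a matching three-state constructor.

using System.Collections.Generic;

//Minimal sketch of the assumed Replay container: the state values are stored
//in the same order they are fed to the ANN, alongside the reward for that step.
public class Replay
{
    public List<double> states;
    public double reward;

    public Replay(double platformRotX, double platformRotZ, double ballPosZ,
                  double ballVelX, double ballVelZ, double r)
    {
        states = new List<double>();
        states.Add(platformRotX);
        states.Add(platformRotZ);
        states.Add(ballPosZ);
        states.Add(ballVelX);
        states.Add(ballVelZ);
        reward = r;
    }
}

The second listing below is a stripped-down pass over the same idea: only three state inputs (platform x rotation, ball z position, ball x angular velocity), only two actions (tilt right and tilt left), and the random-exploration branch commented out.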
private void FixedUpdate()
{
    timer += Time.deltaTime;
    List<double> states = new List<double>();
    List<double> qs = new List<double>();

    //reduced state: platform x rotation, ball z position, ball x angular velocity
    states.Add(this.transform.rotation.x);
    states.Add(ball.transform.position.z);
    states.Add(ball.GetComponent<Rigidbody>().angularVelocity.x);

    qs = ann.SoftMax(ann.CalcOutput(states));
    double maxQ = qs.Max();
    Debug.Log("quality: " + maxQ);
    int maxQIndex = qs.ToList().IndexOf(maxQ);
    exploreRate = Mathf.Clamp(exploreRate - exploreDecay, minExploreRate, maxExploreRate);

    //exploration is disabled in this version
    //if (Random.Range(0, 100) < exploreRate)
    //    maxQIndex = Random.Range(0, 2);

    //action 0: tilt right, action 1: tilt left
    if (maxQIndex == 0)
        this.transform.Rotate(Vector3.right, tiltSpeed * (float)qs[maxQIndex]);
    else if (maxQIndex == 1)
        this.transform.Rotate(Vector3.right, -tiltSpeed * (float)qs[maxQIndex]);

    //small positive reward while the ball stays up, -1 when it drops
    if (ball.GetComponent<BallState>().dropped)
        reward = -1f;
    else
        reward = 0.1f;

    Replay lastMemory = new Replay(this.transform.rotation.x,
                                   ball.transform.position.z,
                                   ball.GetComponent<Rigidbody>().angularVelocity.x,
                                   reward);

    if (replayMemory.Count > memCap)
        replayMemory.RemoveAt(0);

    replayMemory.Add(lastMemory);

    if (ball.GetComponent<BallState>().dropped)
    {
        //replay the stored experiences backwards, propagating the final
        //reward up through the sequence (the Q-learning update)
        for (int i = replayMemory.Count - 1; i >= 0; i--)
        {
            List<double> outputsOld = new List<double>();
            List<double> outputsNew = new List<double>();

            outputsOld = ann.SoftMax(ann.CalcOutput(replayMemory[i].states));
            double maxQOld = outputsOld.Max();
            int action = outputsOld.ToList().IndexOf(maxQOld);

            double feedback;
            if (i == replayMemory.Count - 1 || replayMemory[i].reward == -1)
            {
                feedback = replayMemory[i].reward;
            }
            else
            {
                outputsNew = ann.SoftMax(ann.CalcOutput(replayMemory[i + 1].states));
                maxQ = outputsNew.Max();
                feedback = replayMemory[i].reward + discount * maxQ;
            }

            //assign the corrected Q value to the chosen action and train on it
            outputsOld[action] = feedback;
            ann.Train(replayMemory[i].states, outputsOld);
        }

        ResetBall();
        failCount++;
    }
}
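Neither listing declares the fields it uses, so here is a hedged sketch of the declarations FixedUpdate() assumes. The types are inferred from usage, and every initial value shown is an illustrative placeholder rather than something taken from the source; note that the first listing calls the capacity field mCapacity while the second calls it memCap, and ResetBall() is assumed to be a helper defined elsewhere that puts the ball back above the platform.

public GameObject ball;                          //the ball being balanced on the platform
ANN ann;                                         //neural network approximating the Q values
float reward = 0.0f;                             //reward assigned for the current step
List<Replay> replayMemory = new List<Replay>();  //stored experiences for replay training
int memCap = 10000;                              //placeholder capacity of the replay memory
float discount = 0.99f;                          //placeholder discount factor for future rewards
float exploreRate = 100.0f;                      //placeholder chance (in %) of a random action
float maxExploreRate = 100.0f;                   //placeholder upper bound on exploreRate
float minExploreRate = 0.01f;                    //placeholder lower bound on exploreRate
float exploreDecay = 0.0001f;                    //placeholder amount exploreRate shrinks per tick
float tiltSpeed = 0.5f;                          //placeholder degrees of tilt applied per action
float timer = 0;                                 //how long the ball has stayed on the platform
float maxBalanceTime = 0;                        //best balance time seen so far (first listing only)
int failCount = 0;                               //how many times the ball has been dropped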