public CartPoleAgent(ApiService client, EnvCreateResponse environment, int numberOfActions, CartPoleObservationSpaceInfo observationSpaceInfo) { this.client = client; this.environment = environment; this.numberOfActions = numberOfActions; this.numberOfObservations = observationSpaceInfo.Shape[0]; this.observationSpaceInfo = observationSpaceInfo; // Since CartPole is a continuous space and infinite, we discretize it with buckets this.buckets = this.InitializeBuckets(new int[] { 1, 1, 8, 10 }); this.states = this.InitializeStates(this.buckets); this.q = this.InitializeQTable(this.states.Count, this.numberOfActions); }
public async Task Initialize() { this.environment = await this.client.EnvCreate("CartPole-v0"); var actionSpaceInfo = await this.client.EnvActionSpaceInfo <CartPoleActionSpaceInfo>(environment.InstanceID); var observationSpaceInfo = await this.client.EnvObservationSpaceInfo <CartPoleObservationSpaceInfo>(environment.InstanceID); // Rebid our velocity and angular velocity parameters, the pole should stand still as much as possible observationSpaceInfo.Info.High[1] = 0.5; observationSpaceInfo.Info.Low[1] = -0.5; observationSpaceInfo.Info.High[3] = ConvertToRadians(50); observationSpaceInfo.Info.Low[3] = -ConvertToRadians(50); // Set up the new learning agent this.agent = new CartPoleAgent(this.client, this.environment, actionSpaceInfo.Info.N, observationSpaceInfo.Info); }
/** * An experiment will run, whereafter it will constantly repeat the following loop: * 1. Observe environment state * 2. Based on the observation take an action * 3. Calculate the reward for the action and update our policy * * This we will do EPISODE_COUNT times or until we converge (when we have a convergence detector) */ public async Task Run() { await this.Initialize(); Stopwatch sw = new Stopwatch(); // @todo: start monitoring for (var episode = 0; episode < EPISODE_COUNT; episode++) { sw.Start(); // Reset the whole environment to start over var environment = new EnvStepResponse <double[]>() { IsDone = false, Observation = (await this.client.EnvReset <double[]>(this.environment.InstanceID)).Observation, Reward = 0.0 }; // Keep executing while we can: // if environment.IsDone, then the pole tipped to far, or we died so we stop // if t >= maxTimeSteps then we did not solve it fast enough var t = 0; while (!environment.IsDone && t < MAX_TIME_STEPS) { environment = await this.agent.Act(episode, environment.Observation); t++; } sw.Stop(); Console.WriteLine($"Episode {episode} ended after {t} timesteps in {sw.Elapsed}"); sw.Reset(); } // @todo: stop monitoring }