// Runs the TD(0) estimator on a 5-position random walk (start argument 3) for
// 10, 100 and 1000 episodes and plots each set of estimates next to the actual values.
public static void EstimateStateValuesWithTd0()
{
    var walk = new RandomWalkEnvironment(5, 3);
    var trueValues = new[] { 1.0 / 6, 2.0 / 6, 3.0 / 6, 4.0 / 6, 5.0 / 6 };
    var td0 = new Td0ValueEstimator();

    // Episode budgets and the legend label shown for each of them.
    int[] episodeCounts = { 10, 100, 1000 };
    string[] labels = { "10", "100", "1000" };

    // All estimation runs happen before the plotter is created.
    var estimates = new double[episodeCounts.Length][];
    for (var k = 0; k < episodeCounts.Length; k++)
    {
        estimates[k] = td0.Estimate(walk, episodeCounts[k]);
    }

    var plotter = new Plotter();
    var plot = plotter.Plt;
    plot.Title("TD0 random walk estimates after X episodes");

    double[] positions = { 1, 2, 3, 4, 5 };
    plot.PlotScatter(positions, trueValues, label: "actual");
    for (var k = 0; k < estimates.Length; k++)
    {
        plot.PlotScatter(positions, estimates[k], label: labels[k]);
    }

    plot.Legend();
    plotter.Show();
}
// A walk created at position 0 of 5 that steps left (-1) must finish with reward 0.
public void WhenStartingAtLeftmostPosition_StepLeft_Loses()
{
    // arrange
    var walk = new RandomWalkEnvironment(5, 0);

    // act
    var (_, payoff, finished) = walk.DebugStep(-1);

    // assert
    Assert.AreEqual(0, payoff);
    Assert.True(finished);
}
// A walk created at position 4 of 5 that steps right (+1) must finish with reward 1.
public void WhenStartingAtRightmostPosition_StepRight_Wins()
{
    // arrange
    var walk = new RandomWalkEnvironment(5, 4);

    // act
    var (_, payoff, finished) = walk.DebugStep(1);

    // assert
    Assert.AreEqual(1, payoff);
    Assert.True(finished);
}
// After the step taken from position 4 of 5 to the right, any further step
// (right or left) must throw InvalidOperationException.
public void CannotStep_WhenDone()
{
    // arrange: take the one step right from position 4
    var walk = new RandomWalkEnvironment(5, 4);
    walk.DebugStep(1);

    // act & assert
    foreach (var direction in new[] { 1, -1 })
    {
        Assert.Throws<InvalidOperationException>(() => walk.DebugStep(direction));
    }
}
// Stepping right from position 3 of 5 must move to state 4 with reward 0
// and leave the episode unfinished.
public void WhenStartingInMiddle_StepRight_IncrementsPosition()
{
    // arrange
    var walk = new RandomWalkEnvironment(5, 3);

    // act
    var (position, payoff, finished) = walk.DebugStep(1);

    // assert
    Assert.AreEqual(4, position);
    Assert.AreEqual(0, payoff);
    Assert.False(finished);
}
/// <summary>
/// Estimates one value per position by running episodes on <paramref name="environment"/>.
/// For every time step of an episode, the value of the state visited at that step is moved
/// (by <c>_learningRate</c>) toward the sum of up to <c>_numSteps</c> rewards that followed it,
/// plus the current value of the state <c>_numSteps</c> steps ahead when that state was
/// still reached before the episode ended.
/// </summary>
/// <param name="environment">Supplies <c>NumPositions</c>, <c>Reset()</c> and <c>Step()</c>.</param>
/// <param name="episodeLimit">Number of episodes to run; 10000 when null.</param>
/// <returns>The estimator's value array (all entries start at 0.0).</returns>
public double[] Estimate(RandomWalkEnvironment environment, int? episodeLimit = null)
{
    _values = Enumerable.Repeat(0.0, environment.NumPositions).ToArray();
    var episodesToRun = episodeLimit ?? 10000;

    for (var episode = 0; episode < episodesToRun; episode++)
    {
        // visited[k] is the state at time k; received[k] is the reward obtained on entering it.
        // received[0] is a placeholder so that both lists share the same indexing.
        var visited = new List<int> { environment.Reset() };
        var received = new List<double> { 0.0 };

        // Unknown until the environment reports that the episode is done.
        var terminalTime = int.MaxValue;
        var time = 0;
        int updateTime;

        do
        {
            // Time step whose state value is updated in this iteration (negative: none yet).
            updateTime = time - _numSteps + 1;

            // Keep acting until the episode has ended.
            if (time < terminalTime)
            {
                var (nextState, reward, done) = environment.Step();
                visited.Add(nextState);
                received.Add(reward);
                if (done)
                {
                    terminalTime = time + 1;
                }
            }

            if (updateTime >= 0)
            {
                // Rewards following updateTime; Take() stops early at the end of the episode.
                var target = received.Skip(updateTime + 1).Take(_numSteps).Sum();

                // Bootstrap from the state _numSteps ahead only if the episode got that far.
                var lookahead = updateTime + _numSteps;
                if (lookahead < terminalTime)
                {
                    target += Value(visited[lookahead]);
                }

                var updatedState = visited[updateTime];
                var previous = Value(updatedState);
                _values[updatedState] = previous + _learningRate * (target - previous);
            }

            time++;
        }
        while (updateTime < terminalTime - 1);
    }

    return _values;
}
/// <summary>
/// Resets the value array (every entry 0.5) and the per-state return store, then calls
/// <c>ImproveEstimates</c> once per episode.
/// </summary>
/// <param name="environment">Supplies <c>NumPositions</c> and is passed on to <c>ImproveEstimates</c>.</param>
/// <param name="episodeLimit">Number of episodes to run; 10000 when null.</param>
/// <returns>The estimator's value array.</returns>
public double[] Estimate(RandomWalkEnvironment environment, int? episodeLimit = null)
{
    _values = Enumerable.Repeat(0.5, environment.NumPositions).ToArray();
    _returns = new StateReturns(environment.NumPositions);

    var episodesToRun = episodeLimit ?? 10000;
    for (var remaining = episodesToRun; remaining > 0; remaining--)
    {
        ImproveEstimates(environment);
    }

    return _values;
}
/// <summary>
/// Generates one episode and walks it backwards, accumulating the rewards seen so far.
/// At each time step that is the first visit to its state, the accumulated reward is recorded
/// in <c>_returns</c> and the state's value is set to the average return stored for it.
/// </summary>
/// <param name="environment">Environment handed to <c>RandomWalkEpisode.Generate</c>.</param>
private void ImproveEstimates(RandomWalkEnvironment environment)
{
    var episode = RandomWalkEpisode.Generate(environment);
    var returnSoFar = 0.0;

    // Time steps from the second-to-last step down to 0.
    var timeSteps = Enumerable.Range(0, episode.Length - 1).Reverse();

    foreach (var time in timeSteps)
    {
        var visitedState = episode.Steps[time].State;

        // The reward for leaving this step is stored on the following step.
        returnSoFar += episode.Steps[time + 1].Reward;

        // Only the first visit to a state within the episode contributes a return.
        var isFirstVisit = episode.TimeOfFirstVisit(visitedState) == time;
        if (!isFirstVisit)
        {
            continue;
        }

        _returns.Add(visitedState, returnSoFar);
        _values[visitedState] = _returns.AverageReturnFrom(visitedState);
    }
}
// TODO: results are similar to, but not the same as, those in the book. Why?
//
// For step counts 1, 2, 4, 8 and 16 and learning rates 0.0 .. 1.0, averages the RMS error of
// NStepEstimator after 10 episodes over 200 runs on a 19-position walk, then plots
// error against learning rate with one series per step count.
private static void CompareNStepLengths()
{
    const int runsPerSetting = 200;
    const int episodesPerRun = 10;

    var walk = new RandomWalkEnvironment(19, 10);

    // actual probability of reaching goal state from given state
    var trueValues = Enumerable.Range(1, 19).Select(i => i / 20.0).ToArray();
    var learningRates = new[] { 0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0 };

    var stepCounts = new[] { 1, 2, 4, 8, 16 };
    var settings = stepCounts
        .Select(numSteps => new NStepResult
        {
            Label = $"{numSteps}",
            CreateEstimatorFunc = learningRate => new NStepEstimator(learningRate, numSteps)
        })
        .ToList();

    // Mean error over runsPerSetting fresh estimators for one (step count, learning rate) pair.
    double MeanError(NStepResult setting, double learningRate)
    {
        var errorTotal = 0.0;
        for (var run = 0; run < runsPerSetting; run++)
        {
            var estimator = setting.CreateEstimatorFunc(learningRate);
            var estimates = estimator.Estimate(walk, episodesPerRun);
            errorTotal += AvgRmsError(trueValues, estimates);
        }

        return errorTotal / runsPerSetting;
    }

    foreach (var setting in settings)
    {
        foreach (var learningRate in learningRates)
        {
            setting.RmsErrors.Add(MeanError(setting, learningRate));
        }
    }

    var plotter = new Plotter();
    var plot = plotter.Plt;
    plot.Title("Average RMS error over 19 states and first 10 episodes");
    plot.XLabel("Learning rate");
    plot.YLabel("Avg. RMS error");

    foreach (var setting in settings)
    {
        plot.PlotScatter(learningRates, setting.RmsErrors.ToArray(), label: setting.Label);
    }

    plot.Legend();
    plotter.Show();
}
/// <summary>
/// Resets the value array (every entry 0.5) and runs episodes on <paramref name="environment"/>.
/// After each step, the value of the state just left is moved (by <c>_learningRate</c>) toward
/// the step's reward plus the current value of the state just entered.
/// </summary>
/// <param name="environment">Supplies <c>NumPositions</c>, <c>Reset()</c> and <c>Step()</c>.</param>
/// <param name="episodeLimit">Number of episodes to run; 10000 when null.</param>
/// <returns>The estimator's value array.</returns>
public double[] Estimate(RandomWalkEnvironment environment, int? episodeLimit = null)
{
    _values = Enumerable.Repeat(0.5, environment.NumPositions).ToArray();
    var episodesToRun = episodeLimit ?? 10000;

    for (var episode = 0; episode < episodesToRun; episode++)
    {
        var current = environment.Reset();

        // At least one step is always taken; the loop ends on the step that reports done.
        while (true)
        {
            var (next, reward, finished) = environment.Step();

            var oldEstimate = Value(current);
            var target = reward + Value(next);
            _values[current] = oldEstimate + _learningRate * (target - oldEstimate);

            current = next;
            if (finished)
            {
                break;
            }
        }
    }

    return _values;
}
// For episode budgets 0..99, averages the RMS error over 100 runs of the MC estimator and of
// TD(0) estimators with learning rates 0.05, 0.10 and 0.20 on a 5-position walk, then plots
// the averaged error against the number of episodes.
private static void CompareMcAndTd0()
{
    const int episodeBudgets = 100;
    const int runsPerBudget = 100;

    var walk = new RandomWalkEnvironment(5, 3);
    var trueValues = new[] { 1.0 / 6, 2.0 / 6, 3.0 / 6, 4.0 / 6, 5.0 / 6 };

    var mc = new McValueEstimator();
    var td05 = new Td0ValueEstimator(0.05);
    var td10 = new Td0ValueEstimator(0.10);
    var td20 = new Td0ValueEstimator(0.20);

    // One averaged error per episode budget and estimator.
    var mcCurve = new double[episodeBudgets];
    var td05Curve = new double[episodeBudgets];
    var td10Curve = new double[episodeBudgets];
    var td20Curve = new double[episodeBudgets];

    for (var episodes = 0; episodes < episodeBudgets; episodes++)
    {
        var mcRunErrors = new double[runsPerBudget];
        var td05RunErrors = new double[runsPerBudget];
        var td10RunErrors = new double[runsPerBudget];
        var td20RunErrors = new double[runsPerBudget];

        for (var run = 0; run < runsPerBudget; run++)
        {
            // The estimators are always run in the same order: mc, 0.05, 0.10, 0.20.
            mcRunErrors[run] = AvgRmsError(trueValues, mc.Estimate(walk, episodes));
            td05RunErrors[run] = AvgRmsError(trueValues, td05.Estimate(walk, episodes));
            td10RunErrors[run] = AvgRmsError(trueValues, td10.Estimate(walk, episodes));
            td20RunErrors[run] = AvgRmsError(trueValues, td20.Estimate(walk, episodes));
        }

        mcCurve[episodes] = mcRunErrors.Average();
        td05Curve[episodes] = td05RunErrors.Average();
        td10Curve[episodes] = td10RunErrors.Average();
        td20Curve[episodes] = td20RunErrors.Average();
    }

    var plotter = new Plotter();
    var plot = plotter.Plt;
    plot.Title("MC and TD0 RMS error vs num episodes");

    var episodeAxis = Enumerable.Range(0, episodeBudgets).Select(x => (double)x).ToArray();
    plot.PlotScatter(episodeAxis, mcCurve, label: "mc");
    plot.PlotScatter(episodeAxis, td05Curve, label: "td, learning rate: 0.05");
    plot.PlotScatter(episodeAxis, td10Curve, label: "td, learning rate: 0.10");
    plot.PlotScatter(episodeAxis, td20Curve, label: "td, learning rate: 0.20");
    plot.Legend(location: legendLocation.upperRight);
    plotter.Show();
}