public static void Run()
{
    Console.WriteLine("Beginning q-learning maze");
    Console.WriteLine("Setting up state");

    int numStates = 12;
    var qMaze = QMaze.CreateDemo(numStates); //CreateMaze(numStates);
    double[][] rewardMatrix = CreateRewards(qMaze.NumStates);
    double[][] qualityMatrix = CreateQuality(qMaze.NumStates);

    int goal = 11;
    double gamma = .5;     // discount factor
    double learnRate = .5;
    int maxEpochs = 1000;

    Train(qMaze, rewardMatrix, qualityMatrix, goal, gamma, learnRate, maxEpochs);
    Console.WriteLine("Done.");
    Print(qualityMatrix);

    Console.WriteLine("Solution");
    Walk(qMaze, qualityMatrix);
    Console.WriteLine("End demo");
    Console.ReadLine();
}
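// The Run method above also calls CreateRewards, CreateQuality and Print, and the training and
// walking methods below use a shared Random instance named rnd; none of those members appear in
// this section. What follows is a minimal sketch under assumed conventions: Q starts at zero,
// every move costs a small negative reward, and moves into the goal state pay a large positive
// reward. The exact reward values and the fixed seed are placeholders, not the project's actual code.
private static readonly Random rnd = new Random(1); // fixed seed only for repeatable demo runs

private static double[][] CreateQuality(int ns)
{
    // Q is initialized to all zeros; Train fills it in.
    double[][] quality = new double[ns][];
    for (int i = 0; i < ns; ++i)
    {
        quality[i] = new double[ns];
    }
    return quality;
}

private static double[][] CreateRewards(int ns)
{
    // Assumed shape: -0.1 for every move, plus a large reward for any move into the goal
    // (state 11, i.e. ns - 1, in the 12-state demo).
    double[][] rewards = new double[ns][];
    for (int i = 0; i < ns; ++i)
    {
        rewards[i] = new double[ns];
        for (int j = 0; j < ns; ++j)
        {
            rewards[i][j] = -0.1;
        }
        rewards[i][ns - 1] = 10.0;
    }
    return rewards;
}

private static void Print(double[][] quality)
{
    // Dump the quality matrix, one row per state, two decimals per entry.
    for (int i = 0; i < quality.Length; ++i)
    {
        Console.Write(i + ": ");
        for (int j = 0; j < quality[i].Length; ++j)
        {
            Console.Write(quality[i][j].ToString("F2") + " ");
        }
        Console.WriteLine();
    }
}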
/// <summary>
/// The Q-learning algorithm sometimes goes from the current state to a random next state.
/// </summary>
/// <param name="s"></param>
/// <param name="qMaze"></param>
/// <returns></returns>
/// <remarks>So, if the current state s is 5, then GetRandNextState returns either 1 or 6 or 9 with equal probability (0.33 each).</remarks>
public static int GetRandNextState(int s, QMaze qMaze)
{
    List<int> possNextStates = GetPossibleNextStates(s, qMaze);
    int ct = possNextStates.Count;
    int idx = rnd.Next(0, ct);
    return possNextStates[idx];
}
/// <summary>
/// After the quality matrix has been computed,
/// it can be used to find an optimal path from any starting state to the goal state.
/// The method assumes that the goal state is reachable from the starting state.
/// </summary>
/// <param name="qMaze"></param>
/// <param name="quality"></param>
private static void Walk(QMaze qMaze, double[][] quality)
{
    int curr = qMaze.Start;
    int next;
    Console.Write(curr + "->");
    while (curr != qMaze.Goal)
    {
        next = ArgMax(quality[curr]);
        Console.Write(next + "->");
        curr = next;
    }
    Console.WriteLine("done");
}
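// Walk (and the IL-generating variant below) relies on an ArgMax helper that does not appear in
// this section. A minimal sketch, assuming it simply returns the index of the largest value in a
// row of the quality matrix:
private static int ArgMax(double[] vector)
{
    double maxVal = vector[0];
    int idx = 0;
    for (int i = 1; i < vector.Length; ++i)
    {
        if (vector[i] > maxVal)
        {
            maxVal = vector[i];
            idx = i;
        }
    }
    return idx;
}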
/// <summary>
/// The Q-learning algorithm needs to know what states the system can transition to, given a current state.
/// In this example, a state of the system is the same as the location in the maze, so there are only 12 states.
/// </summary>
/// <param name="s">State of the system; in the demo this is the same as the location.</param>
/// <param name="qMaze"></param>
/// <returns>For example, if the current state s is 5, then GetPossibleNextStates returns a List&lt;int&gt; collection holding (1, 6, 9).</returns>
public static List<int> GetPossibleNextStates(int s, QMaze qMaze)
{
    var FT = qMaze.FT; // feasible-transition matrix: FT[i][j] == 1 means the move from state i to state j is allowed
    List<int> result = new List<int>();
    for (int j = 0; j < FT[s].Length; ++j)
    {
        if (FT[s][j] == 1)
        {
            result.Add(j);
        }
    }
    return result;
}
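// Concrete illustration of the FT convention GetPossibleNextStates relies on, using only the
// example given in the comments above: if state 5 can move to states 1, 6 and 9, then row 5 of
// the adjacency matrix has 1s in columns 1, 6 and 9 and 0s elsewhere. BuildExampleRow5 is a
// hypothetical helper for illustration only, not the full demo maze layout.
private static int[] BuildExampleRow5()
{
    int[] row5 = new int[12];
    row5[1] = 1;
    row5[6] = 1;
    row5[9] = 1;
    // Scanning this row the way GetPossibleNextStates does yields the list {1, 6, 9};
    // GetRandNextState(5, qMaze) would then pick one of those three with equal probability.
    return row5;
}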
/// <summary>
/// After the quality matrix has been computed,
/// it can be used to find an optimal path from any starting state to the goal state.
/// The method assumes that the goal state is reachable from the starting state.
/// </summary>
/// <param name="qMaze"></param>
/// <param name="quality"></param>
private static void Walk(QMaze qMaze, double[][] quality)
{
    int curr = qMaze.Start;
    int next;

    /* The original algorithm assumed a fixed start; that was removed because a random sequence
     * is needed. This, however, prevents walking the maze after the solution is found, because
     * the algorithm does not set quality values for the starting rank. Instead, the no-op opcode
     * is used as the fixed starting point. */
    /*
     * int maxI = 0;
     * int maxK = 0;
     * var bestQ = double.MinValue;
     * for (var i = 0; i < quality.Length; i++)
     * {
     *     for (var k = 0; k < quality[i].Length; k++)
     *     {
     *         if (quality[i][k] > bestQ)
     *         {
     *             bestQ = quality[i][k];
     *             maxI = i;
     *             maxK = k;
     *         }
     *     }
     * }
     * var opCodeI = QOpCodeLearingGenerator.OpCodes[maxI];
     * var opCodeK = QOpCodeLearingGenerator.OpCodes[maxK];
     */

    var opCode = QOpCodeLearingGenerator.OpCodes[curr];
    List<OpCode> solution = new List<OpCode>();
    solution.Add(opCode);
    Console.Write(opCode + "->");
    while (curr != qMaze.Goal)
    {
        next = ArgMax(quality[curr]);
        opCode = QOpCodeLearingGenerator.OpCodes[next];
        solution.Add(opCode);
        Console.Write(opCode + "->");
        curr = next;
    }

    // Replay the discovered opcode sequence through the IL interpreter.
    var writer = new ILInstructionWriter(solution);
    List<ILInstruction> ilInstructions = writer.GetInstructionStream();
    var engine = new ILInstructionEngine();
    dynamic[] args = new dynamic[] { 1 };
    var result = engine.ExecuteTyped(ilInstructions, args: args);
    Console.WriteLine("done");
}
private static QMaze CreateMaze(int ns)
{
    return QMaze.CreateDemo(ns);
}
/// <summary>
/// The key update equation for Q-learning is based on the mathematical Bellman equation.
/// </summary>
/// <param name="qMaze"></param>
/// <param name="rewards"></param>
/// <param name="quality"></param>
/// <param name="goal"></param>
/// <param name="gamma"></param>
/// <param name="learnRate"></param>
/// <param name="maxEpochs"></param>
private static void Train(QMaze qMaze, double[][] rewards, double[][] quality,
                          int goal, double gamma, double learnRate, int maxEpochs)
{
    /*
     * loop maxEpochs times
     *   set currState = a random state
     *   while currState != goalState
     *     pick a random next-state but don't move yet
     *     find largest Q for all next-next-states
     *     update Q[currState][nextState] using Bellman
     *     move to nextState
     *   end-while
     * end-loop
     */
    for (int epoch = 0; epoch < maxEpochs; ++epoch)
    {
        int currState = rnd.Next(0, rewards.Length);

        // The number of training epochs must be determined by trial and error.
        // An alternative design is to iterate until the values in the Q matrix
        // don't change, or until they stabilize to very small changes per iteration.
        while (true)
        {
            int nextState = GetRandNextState(currState, qMaze);
            List<int> possNextNextStates = GetPossibleNextStates(nextState, qMaze);
            double maxQ = double.MinValue;
            for (int j = 0; j < possNextNextStates.Count; ++j)
            {
                int nns = possNextNextStates[j]; // short alias
                double q = quality[nextState][nns];
                if (q > maxQ)
                {
                    maxQ = q;
                }
            }

            /*
             * Imagine you're in a maze. You see that you can go to three different rooms, A, B, C.
             * You pick B, but don't move yet.
             * You ask a friend to go into room B, and the friend tells you that from room B
             * you can go to rooms X, Y, Z, and that of those rooms Y has the best Q value.
             * In other words, Y is the best next-next state.
             */

            /*
             * The update equation has two parts.
             * The first part, ((1 - learnRate) * Q[currState][nextState]), is called the exploit
             * component and adds a fraction of the old value.
             *
             * The second part, (learnRate * (R[currState][nextState] + (gamma * maxQ))),
             * is called the explore component.
             *
             * Larger values of learnRate increase the influence of both current rewards and
             * future rewards (explore) at the expense of past rewards (exploit).
             * The value of gamma, the discount factor, influences the importance of future rewards.
             */
            quality[currState][nextState] =
                ((1 - learnRate) * quality[currState][nextState]) +
                (learnRate * (rewards[currState][nextState] + (gamma * maxQ)));

            currState = nextState;
            if (currState == goal)
            {
                break;
            }
        }
    }
}
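// A worked instance of the update equation above, using the demo's hyperparameters
// (learnRate = 0.5, gamma = 0.5) and hypothetical values for the old Q entry, the step reward,
// and the best next-next Q. BellmanUpdateExample is a hypothetical helper, shown only to make
// the exploit/explore split concrete.
private static void BellmanUpdateExample()
{
    double oldQ = 0.0;      // quality[currState][nextState] before the update
    double reward = -0.1;   // rewards[currState][nextState]
    double bestNextQ = 0.4; // largest Q over the next-next states (maxQ)
    double learnRate = 0.5;
    double gamma = 0.5;

    double newQ = ((1 - learnRate) * oldQ) +                      // exploit: 0.5 * 0.0          = 0.00
                  (learnRate * (reward + (gamma * bestNextQ)));   // explore: 0.5 * (-0.1 + 0.2) = 0.05
    Console.WriteLine(newQ); // 0.05
}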
public static void Run()
{
    Console.WriteLine("Beginning q-learning maze");
    Console.WriteLine("Setting up state");

    int numStates = QOpCodeLearingGenerator.OpCodes.Count;
    var qMaze = QMaze.CreateDemo(numStates); //CreateMaze(numStates);
    qMaze.Start = 0;
    double[][] rewardMatrix = CreateRewards(qMaze.NumStates);
    double[][] qualityMatrix = CreateQuality(qMaze.NumStates);
    qMaze.Goal = QOpCodeLearingGenerator.RetIndex; // 11;

    double gamma = .5; // discount factor
    double learnRate = .5;
    int maxEpochs = 100000;

    //var args = new dynamic[] { "hello world" };
    var argList = new List<object>();
    argList.Add(new[] { 1, 2 });
    var expected = new[] { 2, 1 }; // args[0] with its elements swapped

    // Hard-coded IL that swaps the two elements of the int[] argument in place and returns the
    // array. Executing it first is a sanity check of the interpreter and shows the kind of
    // sequence the learner is expected to rediscover.
    var hardCoded = new List<OpCode>();
    hardCoded.Add(OpCodes.Ldarg_0);
    hardCoded.Add(OpCodes.Ldc_I4_1);
    hardCoded.Add(OpCodes.Ldarg_0);
    hardCoded.Add(OpCodes.Ldc_I4_0);
    hardCoded.Add(OpCodes.Ldelem);
    hardCoded.Add(OpCodes.Ldarg_0);
    hardCoded.Add(OpCodes.Ldc_I4_0);
    hardCoded.Add(OpCodes.Ldarg_0);
    hardCoded.Add(OpCodes.Ldc_I4_1);
    hardCoded.Add(OpCodes.Ldelem);
    hardCoded.Add(OpCodes.Stelem);
    hardCoded.Add(OpCodes.Stelem);
    hardCoded.Add(OpCodes.Ldarg_0);
    hardCoded.Add(OpCodes.Ret);
    var hcResult = ILStackFrameBuilder.BuildAndExecute(hardCoded, args: argList.ToArray());

    Train(qMaze, rewardMatrix, qualityMatrix, qMaze.Goal, gamma, learnRate, maxEpochs,
          expected, argList.ToArray());
    Console.WriteLine("Done.");
    //Print(qualityMatrix);

    Console.WriteLine("Solution");
    Walk(qMaze, qualityMatrix);
    Console.WriteLine("End demo");
    Console.ReadLine();
}
/// <summary>
/// The key update equation for Q-learning is based on the mathematical Bellman equation.
/// </summary>
/// <param name="qMaze"></param>
/// <param name="rewards"></param>
/// <param name="quality"></param>
/// <param name="goal"></param>
/// <param name="gamma"></param>
/// <param name="learnRate"></param>
/// <param name="maxEpochs"></param>
/// <param name="expectedResult">Result the generated IL must produce to earn the full reward.</param>
/// <param name="args">Arguments passed to each candidate IL sequence.</param>
private static void Train(QMaze qMaze, double[][] rewards, double[][] quality,
                          int goal, double gamma, double learnRate, int maxEpochs,
                          dynamic expectedResult, params dynamic[] args)
{
    /*
     * loop maxEpochs times
     *   set currState = a random state
     *   while currState != goalState
     *     pick a random next-state but don't move yet
     *     find largest Q for all next-next-states
     *     update Q[currState][nextState] using Bellman
     *     move to nextState
     *   end-while
     * end-loop
     */
    var stack = new Stack<object>();
    dynamic rewardValue = expectedResult;
    int maxOpCodeLength = 10;

    for (int epoch = 0; epoch < maxEpochs; ++epoch)
    {
        int startState = rnd.Next(0, rewards.Length);
        var currentOpCode = QOpCodeLearingGenerator.OpCodes[startState];
        Console.Title = $"Epoch {epoch} of {maxEpochs} : {currentOpCode.Name}";
        //Console.WriteLine($"testing {currentOpCode}");
        if (currentOpCode.Name == ILOpCodeValueNativeNames.Ldc_I4_1)
        {
            string bp = ""; // breakpoint anchor for debugging epochs that start from Ldc_I4_1
        }

        var l = new List<OpCode>();
        l.Add(currentOpCode);

        // The number of training epochs must be determined by trial and error.
        // An alternative design is to iterate until the values in the Q matrix
        // don't change, or until they stabilize to very small changes per iteration.
        int currState = startState;
        while (l.Count < maxOpCodeLength)
        {
            int nextState = GetRandNextState(currState, qMaze);
            var opCode = QOpCodeLearingGenerator.OpCodes[nextState];
            l.Add(opCode);

            //TODO: make this smarter
            //List<int> possNextNextStates = GetPossibleNextStates(nextState, qMaze);
            List<int> possNextNextStates = QOpCodeLearingGenerator.OpIndexes;
            double maxQ = double.MinValue;
            for (int j = 0; j < possNextNextStates.Count; ++j)
            {
                int nns = possNextNextStates[j]; // short alias
                double q = quality[nextState][nns];
                if (q > maxQ)
                {
                    maxQ = q;
                }
            }

            /*
             * Imagine you're in a maze. You see that you can go to three different rooms, A, B, C.
             * You pick B, but don't move yet.
             * You ask a friend to go into room B, and the friend tells you that from room B
             * you can go to rooms X, Y, Z, and that of those rooms Y has the best Q value.
             * In other words, Y is the best next-next state.
             */

            // Refactored to evaluate whether the generated sequence would earn a reward,
            // instead of reading the reward from the static reward matrix:
            /*
             * quality[currState][nextState] =
             *     ((1 - learnRate) * quality[currState][nextState]) +
             *     (learnRate * (rewards[currState][nextState] + (gamma * maxQ)));
             */
            double reward = -.1;
            if (nextState == QOpCodeLearingGenerator.RetIndex)
            {
                // The candidate sequence ends in Ret, so execute it and score the result.
                var frame = ILStackFrameBuilder.BuildAndExecute(l, 3, args: args);
                if (frame.Exception != null)
                {
                    reward = -.2; // penalize sequences that throw
                }
                else if (frame.ReturnResult != null)
                {
                    var type = frame.ReturnResult.GetType();
                    var expectedType = expectedResult.GetType();
                    if (type == expectedType)
                    {
                        try
                        {
                            if (frame.ReturnResult == expectedResult)
                            {
                                reward = 1;
                                var rewardSteps = string.Join(", ", l.ToArray());
                                Console.WriteLine($"Found reward {rewardSteps}.");
                            }
                        }
                        catch (Exception ex)
                        {
                            // comparing dynamic results can throw; keep the default reward
                        }
                    }
                }

                //var result = ExecuteOpCodes(l, timeoutSeconds: 3, args);
                //if (result.Error != null) // need to penalize errors.
                //{
                //    reward = -.2;
                //}
                //else if (result != null)
                //{
                //    if (result.Success && result.Result != null)
                //    {
                //        var type = result.Result.GetType();
                //        try
                //        {
                //            if (result.Result == expectedResult)
                //            {
                //                reward = 1;
                //                var rewardSteps = string.Join(", ", l.ToArray());
                //                Console.WriteLine($"Found reward {rewardSteps}.");
                //            }
                //        }
                //        catch (Exception ex)
                //        {
                //        }
                //    }
                //}
            }

            /*
             * The update equation has two parts.
             * The first part, ((1 - learnRate) * Q[currState][nextState]), is called the exploit
             * component and adds a fraction of the old value.
             *
             * The second part, (learnRate * (R[currState][nextState] + (gamma * maxQ))),
             * is called the explore component.
             *
             * Larger values of learnRate increase the influence of both current rewards and
             * future rewards (explore) at the expense of past rewards (exploit).
             * The value of gamma, the discount factor, influences the importance of future rewards.
             */
            quality[currState][nextState] =
                ((1 - learnRate) * quality[currState][nextState]) +
                (learnRate * (reward + (gamma * maxQ)));

            currState = nextState;
            if (currState == goal)
            {
                break;
            }
        }
    }
}