internal static void TestStackFrameBuilder()
{
    // A single Nop should build and execute without error.
    var opcodes = new List<OpCode> { OpCodes.Nop };
    var result = ILStackFrameBuilder.BuildAndExecute(opcodes);

    // Ldarg_0 loads the first argument; it should surface as the return result.
    opcodes.Add(OpCodes.Ldarg_0);
    var result2 = ILStackFrameBuilder.BuildAndExecute(opcodes, args: new object[] { 1 });
    System.Diagnostics.Debug.Assert((int)result2.ReturnResult == 1);
}
public void BuildAndExecuteOpCodesWithTimeoutTest()
{
    var opCode1 = OpCodes.Ldc_I4_1;
    var opCode2 = OpCodes.Ret;
    var opCodes = new[] { opCode1, opCode2 }.ToList();

    // Build and execute with a one-second timeout.
    var frame = ILStackFrameBuilder.BuildAndExecute(opCodes, 1);

    Assert.IsTrue(frame.Stream.Count == 2);
    Assert.IsTrue(frame.Stream[0].OpCode == opCode1);
    Assert.IsTrue(frame.Stream[1].OpCode == opCode2);

    var actual = frame.ReturnResult;
    Assert.IsNotNull(actual);
    var expected = 1;
    Assert.IsTrue((int)actual == expected, $"Actual:{actual}\r\nExpected:{expected}\r\n");
}
public void BuildAndExecuteILInstructionsTest()
{
    var opCode1 = OpCodes.Ldc_I4_1;
    var opCode2 = OpCodes.Ret;
    var instruction1 = ILInstruction.Create(opCode1);
    var instruction2 = ILInstruction.Create(opCode2);
    var instructions = new[] { instruction1, instruction2 }.ToList();

    var frame = ILStackFrameBuilder.BuildAndExecute(instructions);

    Assert.IsTrue(frame.Stream.Count == 2);
    Assert.IsTrue(frame.Stream[0].OpCode == opCode1);
    Assert.IsTrue(frame.Stream[1].OpCode == opCode2);

    var actual = frame.ReturnResult;
    Assert.IsNotNull(actual);
    var expected = 1;
    Assert.IsTrue((int)actual == expected, $"Actual:{actual}\r\nExpected:{expected}\r\n");
}
public void TestEmptyStackWithoutArgsOrLocalsAndNoOperandExclusions()
{
    var allOpCodes = OpCodeLookup.OpCodes.Select(x => x.Value).AsQueryable<OpCode>();
    var filters = OpCodeFilters.EmptyStackWithNoArgsLocalsAndNoInlineOperandFilters();

    // Keep only the opcodes that pass every filter.
    allOpCodes = allOpCodes.Where(x => filters.All(filter => filter(x)));
    var rem = allOpCodes.ToList();
    Assert.IsTrue(rem.Count == 12);

    // Each surviving opcode, followed by Ret, should execute cleanly on an empty stack.
    for (var i = 0; i < rem.Count; i++)
    {
        var opCodes = rem.Skip(i).Take(1).ToList();
        opCodes.Add(OpCodes.Ret);
        var result = ILStackFrameBuilder.BuildAndExecute(opCodes);
        Assert.IsTrue(result.ExecutedInstructions == 2);
        Assert.IsNull(result.Exception);
    }
}
public void TestBuildEmptyStackWithArgsAndLocalsAndInlineOnlyOperandFilters()
{
    var allOpCodes = OpCodeLookup.OpCodes.Select(x => x.Value).AsQueryable<OpCode>();
    var args = new List<object>();
    var iLVariables = new List<ILVariable>();

    var filters = OpCodeFilters.BuildEmptyStackWithArgsAndLocalsAndInlineOnlyOperandFilters(args.ToArray(), iLVariables.ToArray());
    int expected = 12;
    var rem = allOpCodes.Where(x => filters.All(filter => filter(x))).ToList();
    Assert.IsTrue(rem.Count == expected);

    // Each added argument is expected to unlock exactly one more opcode.
    for (var argCount = 1; argCount < 5; argCount++)
    {
        args.Add(argCount - 1);
        filters = OpCodeFilters.BuildEmptyStackWithArgsAndLocalsAndInlineOnlyOperandFilters(args.ToArray(), iLVariables.ToArray());
        expected += 1;
        rem = allOpCodes.Where(x => filters.All(filter => filter(x))).ToList();
        Assert.IsTrue(rem.Count == expected);
    }

    // Likewise, each added local is expected to unlock exactly one more opcode.
    for (var variableCount = 1; variableCount < 5; variableCount++)
    {
        iLVariables.Add(new ILVariable
        {
            Name = $"var{variableCount}",
            Index = variableCount - 1,
            Type = typeof(int),
            Value = variableCount - 1
        });
        filters = OpCodeFilters.BuildEmptyStackWithArgsAndLocalsAndInlineOnlyOperandFilters(args.ToArray(), iLVariables.ToArray());
        expected += 1;
        rem = allOpCodes.Where(x => filters.All(filter => filter(x))).ToList();
        Assert.IsTrue(rem.Count == expected);
    }

    // Every surviving opcode, followed by Ret, should execute cleanly with the args and locals supplied.
    for (var i = 0; i < rem.Count; i++)
    {
        var opCodes = rem.Skip(i).Take(1).ToList();
        opCodes.Add(OpCodes.Ret);
        var result = ILStackFrameBuilder.BuildAndExecute(opCodes, args: args.ToArray(), locals: iLVariables.ToArray());
        Assert.IsTrue(result.ExecutedInstructions == 2);
        Assert.IsNull(result.Exception);
    }
}
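// The tests above repeatedly select opcodes with filters.All(filter => filter(x)).
// Below is a minimal sketch of that predicate-composition pattern as a reusable
// helper, assuming the filters are Func<OpCode, bool> delegates; the helper name
// is hypothetical and not part of OpCodeFilters.
private static IEnumerable<OpCode> WhereAll(IEnumerable<OpCode> opCodes, IEnumerable<Func<OpCode, bool>> filters)
{
    // An opcode survives only if every filter accepts it.
    return opCodes.Where(op => filters.All(filter => filter(op)));
}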
public static void Run()
{
    Console.WriteLine("Beginning q-learning maze");
    Console.WriteLine("Setting up state");

    // Each opcode is a state in the "maze"; the goal state is Ret.
    int numStates = QOpCodeLearingGenerator.OpCodes.Count;
    var qMaze = QMaze.CreateDemo(numStates);
    qMaze.Start = 0;
    double[][] rewardMatrix = CreateRewards(qMaze.NumStates);
    double[][] qualityMaxtrix = CreateQuality(qMaze.NumStates);
    qMaze.Goal = QOpCodeLearingGenerator.RetIndex;

    double gamma = .5;     // discount factor: weight of future rewards
    double learnRate = .5;
    int maxEpochs = 100000;

    // Target: given the array { 1, 2 }, learn a program that returns { 2, 1 }.
    var argList = new List<object> { new[] { 1, 2 } };
    var expected = new[] { 2, 1 };

    // Hand-written reference solution: swap elements 0 and 1 in place, then return the array.
    var hardCoded = new List<OpCode>
    {
        OpCodes.Ldarg_0,
        OpCodes.Ldc_I4_1,
        OpCodes.Ldarg_0,
        OpCodes.Ldc_I4_0,
        OpCodes.Ldelem,
        OpCodes.Ldarg_0,
        OpCodes.Ldc_I4_0,
        OpCodes.Ldarg_0,
        OpCodes.Ldc_I4_1,
        OpCodes.Ldelem,
        OpCodes.Stelem,
        OpCodes.Stelem,
        OpCodes.Ldarg_0,
        OpCodes.Ret,
    };
    var hcResult = ILStackFrameBuilder.BuildAndExecute(hardCoded, args: argList.ToArray());

    Train(qMaze, rewardMatrix, qualityMaxtrix, qMaze.Goal, gamma, learnRate, maxEpochs, expected, argList.ToArray());
    Console.WriteLine("Done.");
    Console.WriteLine("Solution");
    Walk(qMaze, qualityMaxtrix);
    Console.WriteLine("End demo");
    Console.ReadLine();
}
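// CreateRewards and CreateQuality are referenced above but not shown in this section.
// A minimal sketch of what such a helper plausibly looks like, assuming it simply
// allocates a numStates x numStates jagged matrix initialized to zero (conventional
// for a fresh Q matrix). The name below is a hypothetical stand-in, not the project's
// actual implementation.
private static double[][] CreateZeroMatrixSketch(int numStates)
{
    var matrix = new double[numStates][];
    for (int i = 0; i < numStates; i++)
    {
        matrix[i] = new double[numStates]; // double elements default to 0.0
    }
    return matrix;
}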
/// <summary>
/// The key update equation for Q-learning is based on the mathematical Bellman equation:
/// Q[s][s'] = (1 - learnRate) * Q[s][s'] + learnRate * (R[s][s'] + gamma * maxQ(s')).
/// </summary>
/// <param name="qMaze">State graph; each state is an opcode.</param>
/// <param name="rewards">Reward matrix R.</param>
/// <param name="quality">Quality matrix Q, updated in place.</param>
/// <param name="goal">Index of the goal state (Ret).</param>
/// <param name="gamma">Discount factor: weight of future rewards.</param>
/// <param name="learnRate">Learning rate: explore vs. exploit trade-off.</param>
/// <param name="maxEpochs">Number of training iterations.</param>
/// <param name="expectedResult">Target return value used to assign the reward.</param>
/// <param name="args">Arguments passed to each candidate opcode sequence.</param>
private static void Train(QMaze qMaze, double[][] rewards, double[][] quality, int goal,
    double gamma, double learnRate, int maxEpochs, dynamic expectedResult, params dynamic[] args)
{
    /*
     * loop maxEpochs times
     *   set currState = a random state
     *   while currState != goalState
     *     pick a random next-state but don't move yet
     *     find largest Q for all next-next-states
     *     update Q[currState][nextState] using Bellman
     *     move to nextState
     *   end-while
     * end-loop
     */
    int maxOpCodeLength = 10;
    for (int epoch = 0; epoch < maxEpochs; ++epoch)
    {
        int startState = rnd.Next(0, rewards.Length);
        var currentOpCode = QOpCodeLearingGenerator.OpCodes[startState];
        Console.Title = $"Epoch {epoch} of {maxEpochs} : {currentOpCode.Name}";

        var l = new List<OpCode> { currentOpCode };

        // The number of training epochs must be determined by trial and error.
        // An alternative design is to iterate until the values in the Q matrix
        // don't change, or until they stabilize to very small changes per iteration.
        int currState = startState;
        while (l.Count < maxOpCodeLength)
        {
            int nextState = GetRandNextState(currState, qMaze);
            var opCode = QOpCodeLearingGenerator.OpCodes[nextState];
            l.Add(opCode);

            // TODO: make this smarter, e.g.
            // List<int> possNextNextStates = GetPossibleNextStates(nextState, qMaze);
            List<int> possNextNextStates = QOpCodeLearingGenerator.OpIndexes;

            /*
             * Find the largest Q over all next-next-states. Imagine you're in a maze
             * and can go to three different rooms, A, B, C. You pick B, but don't move
             * yet. You ask a friend to go into room B, and the friend tells you that
             * from room B you can go to rooms X, Y, Z, and that of those rooms Y has
             * the best Q value. In other words, Y is the best next-next state.
             */
            double maxQ = double.MinValue;
            for (int j = 0; j < possNextNextStates.Count; ++j)
            {
                int nns = possNextNextStates[j]; // short alias
                double q = quality[nextState][nns];
                if (q > maxQ)
                {
                    maxQ = q;
                }
            }

            // Small negative step reward by default; reaching Ret executes the candidate
            // program and rewards it based on whether its result matches the expected value.
            double reward = -.1;
            if (nextState == QOpCodeLearingGenerator.RetIndex)
            {
                var frame = ILStackFrameBuilder.BuildAndExecute(l, 3, args: args);
                if (frame.Exception != null)
                {
                    reward = -.2; // penalize errors
                }
                else if (frame.ReturnResult != null)
                {
                    var type = frame.ReturnResult.GetType();
                    var expectedType = expectedResult.GetType();
                    if (type == expectedType)
                    {
                        try
                        {
                            bool match;
                            if (frame.ReturnResult is int[] actualArray && expectedResult is int[] expectedArray)
                            {
                                // With dynamic operands, == on two arrays is reference equality
                                // and would never match a freshly built result, so compare element-wise.
                                match = System.Linq.Enumerable.SequenceEqual(actualArray, expectedArray);
                            }
                            else
                            {
                                match = frame.ReturnResult == expectedResult;
                            }
                            if (match)
                            {
                                reward = 1;
                                var rewardSteps = string.Join(", ", l.ToArray());
                                Console.WriteLine($"Found reward {rewardSteps}.");
                            }
                        }
                        catch (Exception)
                        {
                            // Some dynamic comparisons throw; treat them as non-matches.
                        }
                    }
                }
            }

            /*
             * The update equation has two parts.
             * The first part, ((1 - learnRate) * Q[currState][nextState]), is called the
             * exploit component and adds a fraction of the old value.
             *
             * The second part, (learnRate * (R[currState][nextState] + (gamma * maxQ))),
             * is called the explore component.
             *
             * Larger values of learnRate increase the influence of both current rewards
             * and future rewards (explore) at the expense of past rewards (exploit).
             * The value of gamma, the discount factor, influences the importance of future rewards.
             */
            quality[currState][nextState] =
                ((1 - learnRate) * quality[currState][nextState]) +
                (learnRate * (reward + (gamma * maxQ)));

            currState = nextState;
            if (currState == goal)
            {
                break;
            }
        }
    }
}
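// A minimal, self-contained sketch of the Bellman update applied in Train above,
// factored into a hypothetical helper (not part of the project) with a worked example:
// with learnRate = gamma = 0.5, oldQ = 0, reward = 1, and maxQ = 0.2, the new value is
// 0.5 * 0 + 0.5 * (1 + 0.5 * 0.2) = 0.55.
private static double BellmanUpdate(double oldQ, double reward, double maxQ, double learnRate, double gamma)
{
    // Exploit component keeps a fraction of the old estimate; explore component
    // mixes in the observed reward plus the discounted best future value.
    return ((1 - learnRate) * oldQ) + (learnRate * (reward + (gamma * maxQ)));
}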