Example #1
        public static void Run()
        {
            Console.WriteLine("Beginning q-learning maze");

            Console.WriteLine("Setting up state");

            int numStates = 12;
            var qMaze     = QMaze.CreateDemo(numStates); //CreateMaze(numStates);

            double[][] rewardMatrix  = CreateRewards(qMaze.NumStates);
            double[][] qualityMatrix = CreateQuality(qMaze.NumStates);

            int    goal      = 11;
            double gamma     = 0.5; // discount factor
            double learnRate = 0.5;
            int    maxEpochs = 1000;

            Train(qMaze, rewardMatrix, qualityMatrix, goal, gamma, learnRate, maxEpochs);

            Console.WriteLine("Done.");
            Print(qualityMatrix);

            Console.WriteLine("Solution");

            Walk(qMaze, qualityMatrix);

            Console.WriteLine("End demo");
            Console.ReadLine();
        }
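CreateRewards and CreateQuality are not shown in this listing. A minimal sketch of what they might look like, assuming jagged numStates x numStates matrices with Q values starting at zero, a small negative reward for every move, and a large reward for a move into the goal state (the exact cells and values are assumptions, not taken from the source):

        // Hypothetical helpers used by Run; bodies are illustrative assumptions.
        private static double[][] CreateQuality(int ns)
        {
            double[][] quality = new double[ns][];
            for (int i = 0; i < ns; ++i)
            {
                quality[i] = new double[ns]; // all Q values start at 0.0
            }
            return quality;
        }

        private static double[][] CreateRewards(int ns)
        {
            double[][] rewards = new double[ns][];
            for (int i = 0; i < ns; ++i)
            {
                rewards[i] = new double[ns];
                for (int j = 0; j < ns; ++j)
                {
                    rewards[i][j] = -0.1; // assumed: small penalty for every move
                }
            }
            rewards[10][11] = 10.0;       // assumed: large payoff for a move into goal state 11
            return rewards;
        }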
Example #2
        /// <summary>
        /// The Q-learning algorithm sometimes goes from the current state to a random next state.
        /// </summary>
        /// <param name="s"></param>
        /// <param name="qMaze"></param>
        /// <returns></returns>
        /// <remarks>So, if the current state s is 5, then GetRandNextState returns either 1 or 6 or 9 with equal probability (0.33 each)</remarks>
        public static int GetRandNextState(int s, QMaze qMaze)
        {
            List <int> possNextStates = GetPossibleNextStates(s, qMaze);
            int        ct             = possNextStates.Count;
            int        idx            = rnd.Next(0, ct);

            return(possNextStates[idx]);
        }
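Both GetRandNextState and Train use a class-level rnd field that does not appear in these listings. A minimal sketch, assuming a shared System.Random instance (the seed value is an assumption):

        // Assumed class-level random number generator shared by GetRandNextState and Train.
        private static readonly Random rnd = new Random(1);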
Example #3
        /// <summary>
        /// After the quality matrix has been computed,
        /// it can be used to find an optimal path from any starting state to the goal state.
        /// the method assumes that the goal state is reachable from the starting state
        /// </summary>
        /// <param name="qMaze"></param>
        /// <param name="quality"></param>
        private static void Walk(QMaze qMaze, double[][] quality)
        {
            int curr = qMaze.Start;
            int next;

            Console.Write(curr + "->");
            while (curr != qMaze.Goal)
            {
                next = ArgMax(quality[curr]);
                Console.Write(next + "->");
                curr = next;
            }
            Console.WriteLine("done");
        }
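Walk calls an ArgMax helper that is not included here. A minimal sketch, assuming it simply returns the index of the largest value in a row of the quality matrix:

        // Hypothetical ArgMax: returns the index of the largest value in the vector.
        private static int ArgMax(double[] vector)
        {
            int maxIdx = 0;
            for (int i = 1; i < vector.Length; ++i)
            {
                if (vector[i] > vector[maxIdx])
                {
                    maxIdx = i;
                }
            }
            return maxIdx;
        }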
Example #4
        /// <summary>
        /// The Q-learning algorithm needs to know which states the system can transition to from a given current state. In this example a state of the system is the same as a location in the maze, so there are only 12 states.
        /// </summary>
        /// <param name="s">State of the system; in this demo it is the same as the location</param>
        /// <param name="qMaze"></param>
        /// <returns>For example, if the current state s is 5, then GetPossibleNextStates returns a List&lt;int&gt; collection holding (1, 6, 9)</returns>
        public static List <int> GetPossibleNextStates(int s, QMaze qMaze)
        {
            var        FT     = qMaze.FT;
            List <int> result = new List <int>();

            for (int j = 0; j < FT.Length; ++j)
            {
                if (FT[s][j] == 1)
                {
                    result.Add(j);
                }
            }
            return(result);
        }
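FT is the maze's allowed-transition matrix: FT[s][j] == 1 means a move from state s to state j is permitted. A minimal sketch of how such a matrix could be built for the example in the summary (state 5 connecting to 1, 6 and 9); the method name and the remaining layout are assumptions:

        // Hypothetical transition-matrix builder; only the row for state 5 is filled in,
        // matching the example above (5 -> 1, 6 or 9).
        private static int[][] CreateTransitions(int ns)
        {
            int[][] ft = new int[ns][];
            for (int i = 0; i < ns; ++i)
            {
                ft[i] = new int[ns];
            }
            ft[5][1] = 1;
            ft[5][6] = 1;
            ft[5][9] = 1;
            // ... remaining rows would describe the rest of the maze
            return ft;
        }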
Example #5
        /// <summary>
        /// After the quality matrix has been computed,
        /// it can be used to find an optimal path from any starting state to the goal state.
        /// the method assumes that the goal state is reachable from the starting state
        /// </summary>
        /// <param name="qMaze"></param>
        /// <param name="quality"></param>
        private static void Walk(QMaze qMaze, double[][] quality)
        {
            int curr = qMaze.Start;
            int next;

            /* The original algorithm assumed a fixed starting state; that was removed because a random
             * sequence is needed. This, however, prevents walking the maze after the solution is found,
             * because the algorithm does not set quality values for the starting rank.
             * Instead, a no-op is used as the fixed starting point. */
            /*
             * int maxI = 0;
             * int maxK = 0;
             * var bestQ = double.MinValue;
             * for (var i = 0; i < quality.Length; i++)
             * {
             * for (var k = 0; k < quality[i].Length; k++)
             * {
             *     if (quality[i][k] > bestQ)
             *     {
             *         bestQ = quality[i][k];
             *         maxI = i;
             *         maxK = k;
             *     }
             * }
             * }
             * var opCodeI = QOpCodeLearingGenerator.OpCodes[maxI];
             * var opCodeK = QOpCodeLearingGenerator.OpCodes[maxK];
             */
            var           opCode   = QOpCodeLearingGenerator.OpCodes[curr];
            List <OpCode> solution = new List <OpCode>();

            solution.Add(opCode);
            Console.Write(opCode + "->");
            while (curr != qMaze.Goal)
            {
                next   = ArgMax(quality[curr]);
                opCode = QOpCodeLearingGenerator.OpCodes[next];
                solution.Add(opCode);
                Console.Write(opCode + "->");
                curr = next;
            }

            var writer = new ILInstructionWriter(solution);
            List <ILInstruction> ilInstructions = writer.GetInstructionStream();
            var engine = new ILInstructionEngine();

            dynamic[] args   = new dynamic[] { 1 };
            var       result = engine.ExecuteTyped(ilInstructions, args: args);

            Console.WriteLine("done");
        }
Example #6
 private static QMaze CreateMaze(int ns)
 {
     return(QMaze.CreateDemo(ns));
 }
Example #7
        /// <summary>
        /// The key update equation for Q-learning is based on the mathematical Bellman equation
        /// </summary>
        /// <param name="qMaze"></param>
        /// <param name="rewards"></param>
        /// <param name="quality"></param>
        /// <param name="goal"></param>
        /// <param name="gamma"></param>
        /// <param name="learnRate"></param>
        /// <param name="maxEpochs"></param>
        private static void Train(QMaze qMaze, double[][] rewards, double[][] quality, int goal, double gamma, double learnRate, int maxEpochs)
        {
            /*
             * loop maxEpochs times
             * set currState = a random state
             * while currState != goalState
             * pick a random next-state but don't move yet
             * find largest Q for all next-next-states
             * update Q[currState][nextState] using Bellman
             * move to nextState
             * end-while
             * end-loop
             */

            for (int epoch = 0; epoch < maxEpochs; ++epoch)
            {
                int currState = rnd.Next(0, rewards.Length);
                // The number of training epochs must be determined by trial and error.
                // An alternative design is to iterate until the values in the Q matrix
                // don't change, or until they stabilize to very small changes per iteration.

                while (true)
                {
                    int        nextState          = GetRandNextState(currState, qMaze);
                    List <int> possNextNextStates = GetPossibleNextStates(nextState, qMaze);
                    double     maxQ = double.MinValue;
                    for (int j = 0; j < possNextNextStates.Count; ++j)
                    {
                        int    nns = possNextNextStates[j]; // short alias
                        double q   = quality[nextState][nns];
                        if (q > maxQ)
                        {
                            maxQ = q;
                        }
                    }

                    /*
                     * Imagine you’re in a maze. You see that you can go to three different rooms, A, B, C.
                     * You pick B, but don’t move yet.
                     * You ask a friend to go into room B and the friend tells you
                     * that from room B you can go to rooms X, Y, Z and that of those
                     * rooms Y has the best Q value. In other words, Y is the best next-next state.
                     * */
                    quality[currState][nextState] =
                        ((1 - learnRate) * quality[currState][nextState]) +
                        (learnRate * (rewards[currState][nextState] + (gamma * maxQ)));
                    currState = nextState;
                    if (currState == goal)
                    {
                        break;
                    }

                    /*
                     * The update equation has two parts.
                     * The first part, ((1 - lrnRate) * Q[currState][nextState]), is called the exploit component
                     * and adds a fraction of the old value.
                     *
                     * The second part, (lrnRate * (R[currState][nextState] + (gamma * maxQ))),
                     * is called the explore component.
                     *
                     * Larger values of the lrnRate increase the influence of both current rewards and
                     * future rewards (explore) at the expense of past rewards (exploit).
                     * The value of gamma, the discount factor, influences the importance of future rewards.
                     * */
                }
            }
        }
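As a concrete instance of the update inside Train, suppose learnRate and gamma are both 0.5 (as in Example #1), the current Q value is 0.0, the immediate reward is -0.1, and the best next-next-state Q is 2.0. The method name and numbers below are illustrative only:

        // Hypothetical worked example of one Bellman update (values are assumptions).
        private static void BellmanUpdateExample()
        {
            double oldQ   = 0.0;   // quality[currState][nextState] before the update
            double reward = -0.1;  // rewards[currState][nextState]
            double maxQ   = 2.0;   // largest Q over all next-next-states
            double newQ   = ((1 - 0.5) * oldQ) + (0.5 * (reward + (0.5 * maxQ)));
            Console.WriteLine(newQ); // (0.5 * 0.0) + (0.5 * (-0.1 + 1.0)) = 0.45
        }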
Example #8
        public static void Run()
        {
            Console.WriteLine("Beginning q-learning maze");

            Console.WriteLine("Setting up state");



            int numStates = QOpCodeLearingGenerator.OpCodes.Count;
            var qMaze     = QMaze.CreateDemo(numStates); //CreateMaze(numStates);

            qMaze.Start = 0;
            double[][] rewardMatrix  = CreateRewards(qMaze.NumStates);
            double[][] qualityMatrix = CreateQuality(qMaze.NumStates);

            qMaze.Goal = QOpCodeLearingGenerator.RetIndex; // 11
            double gamma     = 0.5;                        // discount factor
            double learnRate = 0.5;
            int    maxEpochs = 100000;

            //var args = new dynamic[] { "hello world" };

            var argList = new List <object>();

            argList.Add(new[] { 1, 2 });
            var expected = new[] { 2, 1 }; // args[0]


            var hardCoded = new List <OpCode>();
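            // The hand-coded sequence below swaps elements 0 and 1 of the int[] argument
            // (turning { 1, 2 } into { 2, 1 }) and then returns the array.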

            hardCoded.Add(OpCodes.Ldarg_0);
            hardCoded.Add(OpCodes.Ldc_I4_1);
            hardCoded.Add(OpCodes.Ldarg_0);
            hardCoded.Add(OpCodes.Ldc_I4_0);
            hardCoded.Add(OpCodes.Ldelem);


            hardCoded.Add(OpCodes.Ldarg_0);
            hardCoded.Add(OpCodes.Ldc_I4_0);
            hardCoded.Add(OpCodes.Ldarg_0);
            hardCoded.Add(OpCodes.Ldc_I4_1);
            hardCoded.Add(OpCodes.Ldelem);

            hardCoded.Add(OpCodes.Stelem);
            hardCoded.Add(OpCodes.Stelem);
            hardCoded.Add(OpCodes.Ldarg_0);
            hardCoded.Add(OpCodes.Ret);

            var hcResult = ILStackFrameBuilder.BuildAndExecute(hardCoded, args: argList.ToArray());


            Train(qMaze, rewardMatrix, qualityMatrix, qMaze.Goal, gamma, learnRate, maxEpochs, expected, argList.ToArray());

            Console.WriteLine("Done.");
            //Print(qualityMatrix);

            Console.WriteLine("Solution");

            Walk(qMaze, qualityMatrix);

            Console.WriteLine("End demo");
            Console.ReadLine();
        }
Example #9
        /// <summary>
        /// The key update equation for Q-learning is based on the mathematical Bellman equation
        /// </summary>
        /// <param name="qMaze"></param>
        /// <param name="rewards"></param>
        /// <param name="quality"></param>
        /// <param name="goal"></param>
        /// <param name="gamma"></param>
        /// <param name="learnRate"></param>
        /// <param name="maxEpochs"></param>
        private static void Train(QMaze qMaze,
                                  double[][] rewards,
                                  double[][] quality,
                                  int goal,
                                  double gamma,
                                  double learnRate,
                                  int maxEpochs,
                                  dynamic expectedResult,
                                  params dynamic[] args)
        {
            /*
             * loop maxEpochs times
             * set currState = a random state
             * while currState != goalState
             * pick a random next-state but don't move yet
             * find largest Q for all next-next-states
             * update Q[currState][nextState] using Bellman
             * move to nextState
             * end-while
             * end-loop
             */

            var     stack           = new Stack <object>();
            dynamic rewardValue     = expectedResult;
            int     maxOpCodeLength = 10;

            for (int epoch = 0; epoch < maxEpochs; ++epoch)
            {
                int startState = rnd.Next(0, rewards.Length);

                var currentOpCode = QOpCodeLearingGenerator.OpCodes[startState];


                Console.Title = $"Epoch {epoch} of {maxEpochs} : {currentOpCode.Name}";
                //Console.WriteLine($"testing {currentOpCode}");
                // Debugging hook: the empty assignment gives a convenient breakpoint target
                // when an epoch starts from the Ldc_I4_1 opcode.
                if (currentOpCode.Name == ILOpCodeValueNativeNames.Ldc_I4_1)
                {
                    string bp = "";
                }
                var l = new List <OpCode>();
                l.Add(currentOpCode);
                // The number of training epochs must be determined by trial and error.
                // An alternative design is to iterate until the values in the Q matrix
                // don't change, or until they stabilize to very small changes per iteration.
                int currState = startState;
                while (l.Count < maxOpCodeLength)
                {
                    int nextState = GetRandNextState(currState, qMaze);
                    var opCode    = QOpCodeLearingGenerator.OpCodes[nextState];
                    l.Add(opCode);
                    //TODO: make this smarter
                    //List<int> possNextNextStates = GetPossibleNextStates(nextState, qMaze);
                    List <int> possNextNextStates = QOpCodeLearingGenerator.OpIndexes;
                    double     maxQ = double.MinValue;
                    for (int j = 0; j < possNextNextStates.Count; ++j)
                    {
                        int    nns = possNextNextStates[j]; // short alias
                        double q   = quality[nextState][nns];
                        if (q > maxQ)
                        {
                            maxQ = q;
                        }
                    }

                    /*
                     * Imagine you’re in a maze. You see that you can go to three different rooms, A, B, C.
                     * You pick B, but don’t move yet.
                     * You ask a friend to go into room B and the friend tells you
                     * that from room B you can go to rooms X, Y, Z and that of those
                     * rooms Y has the best Q value. In other words, Y is the best next-next state.
                     * */


                    ////refactor to evaluate if would return reward.

                    /*
                     * quality[currState][nextState] =
                     *  ((1 - learnRate) * quality[currState][nextState]) +
                     *  (learnRate * (rewards[currState][nextState] + (gamma * maxQ)));
                     */

                    double reward = -.1;
                    if (nextState == QOpCodeLearingGenerator.RetIndex)
                    {
                        var frame = ILStackFrameBuilder.BuildAndExecute(l, 3, args: args);
                        if (frame.Exception != null)
                        {
                            reward = -.2;
                        }
                        else if (frame.ReturnResult != null)
                        {
                            var type         = frame.ReturnResult.GetType();
                            var expectedType = expectedResult.GetType();
                            if (type == expectedType)
                            {
                                try
                                {
                                    // Note: dynamic '==' compares arrays by reference, so array results
                                    // such as { 2, 1 } would never match; compare structurally instead.
                                    if (System.Collections.StructuralComparisons.StructuralEqualityComparer
                                            .Equals((object)frame.ReturnResult, (object)expectedResult))
                                    {
                                        reward = 1;
                                        var rewardSteps = string.Join(", ", l.ToArray());
                                        Console.WriteLine($"Found reward {rewardSteps}.");
                                    }
                                }
                                catch (Exception)
                                {
                                    // comparison is best-effort; treat failures as no reward
                                }
                            }
                        }
                        //var result = ExecuteOpCodes(l, timeoutSeconds: 3, args);
                        //if (result.Error != null) // need to penalize errors.
                        //{
                        //    reward = -.2;
                        //}
                        //else if (result != null)
                        //{
                        //    if (result.Success && result.Result != null)
                        //    {
                        //        var type = result.Result.GetType();
                        //        try
                        //        {
                        //            if (result.Result == expectedResult)
                        //            {
                        //                reward = 1;
                        //                var rewardSteps = string.Join(", ", l.ToArray());
                        //                Console.WriteLine($"Found reward {rewardSteps}.");
                        //            }
                        //        }
                        //        catch (Exception ex)
                        //        {

                        //        }


                        //    }
                        //}
                    }

                    quality[currState][nextState] =
                        ((1 - learnRate) * quality[currState][nextState]) +
                        (learnRate * (reward + (gamma * maxQ)));



                    currState = nextState;
                    if (currState == goal)
                    {
                        break;
                    }

                    /*
                     * The update equation has two parts.
                     * The first part, ((1 - lrnRate) * Q[currState][nextState]), is called the exploit component
                     * and adds a fraction of the old value.
                     *
                     * The second part, (lrnRate * (R[currState][nextState] + (gamma * maxQ))),
                     * is called the explore component.
                     *
                     * Larger values of the lrnRate increase the influence of both current rewards and
                     * future rewards (explore) at the expense of past rewards (exploit).
                     * The value of gamma, the discount factor, influences the importance of future rewards.
                     * */
                }
            }
        }