// The main agent loop
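// Runs one simulation step per frame: read both agents' states, choose both
// actions from those pre-move states, apply the actions in a random order,
// hand out rewards, run the learning updates, and handle match/training resets.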
    IEnumerator performAgentLoop()
    {
        float mr_step_reward;
        float ms_step_reward;


        int opponent_agent_strategy_type = -1; // 0 => random; 1 => q-learning; 2 => minimax-q

        if (o_rdm_select)
        {
            opponent_agent_strategy_type = 0;
            //Debug.Log("opponent random");
        }
        else if (o_q_select)
        {
            opponent_agent_strategy_type = 1;
            //Debug.Log("opponent q");
        }
        else if (o_mm_select)
        {
            opponent_agent_strategy_type = 2;
            //Debug.Log("opponent minimax");
        }

        int ms_pac_man_agent_strategy_type = -1; // same encoding as above

        if (rdm_select)
        {
            ms_pac_man_agent_strategy_type = 0;
            //Debug.Log("agent random");
        }
        else if (q_select)
        {
            ms_pac_man_agent_strategy_type = 1;
            //Debug.Log("agent qlearning");
        }
        else if (mm_select)
        {
            ms_pac_man_agent_strategy_type = 2;
            //Debug.Log("agent minimax");
        }

        // yield return new WaitForSeconds(2); // optional: pause for two seconds before the loop starts
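        // construct the two agents with the exploration/decay/learning-rate/discount
        // hyperparameters configured on the grid, plus each agent's strategy type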
        mrPacMan = new MrPacManAgent(gridController.grid.Explor, gridController.grid.Decay, gridController.grid.Learning_rate, gridController.grid.Discount_factor, opponent_agent_strategy_type, isTraining);
        msPacMan = new MsPacManAgent(gridController.grid.Explor, gridController.grid.Decay, gridController.grid.Learning_rate, gridController.grid.Discount_factor, ms_pac_man_agent_strategy_type, isTraining);



        while (agentsRunning)
        {
            // yield return new WaitForSeconds(2); // optional: slow each step down for viewing
            yield return null; // otherwise advance one step per frame

            // get the current state
            mr_curr_state = gridController.grid.GetCell(gridController.MrPy, gridController.MrPx);
            ms_curr_state = gridController.grid.GetCell(gridController.MsPy, gridController.MsPx);

            int mr_pac_man_action;
            int ms_pac_man_action;

            // get each agent's action
            mr_pac_man_action = mrPacMan.getAction(mr_curr_state);
            ms_pac_man_action = msPacMan.getAction(ms_curr_state);
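            // both actions are chosen from the pre-move states, so the decision is
            // effectively simultaneous: neither agent observes the other's move first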

            // apply the agents' actions in a random order
            if (UnityEngine.Random.Range(0, 2) == 0) // Mr Pac-Man moves first
            {
                // the first player always gets to move: compute the step reward
                // from the pre-move state, then apply the chosen action
                mr_step_reward   = calculateStepReward(mr_curr_state, mr_pac_man_action, 1); // update Mr Pac-Man's score before moving
                action_return_mr = applyMrPacManAction(mr_pac_man_action);
                //Debug.Log("mr step reward: " + mr_step_reward);


                // the second player might have been eaten (return code 2) or might
                // have eaten the first mover (return code 3)
                if (action_return_mr == 2) // Mr Pac-Man ate Ms Pac-Man
                {
                    mr_step_reward = 100f;
                    ms_step_reward = -100f;
                }
                else if (action_return_mr == 3) // Mr Pac-Man was eaten
                {
                    mr_step_reward = -100f;
                    ms_step_reward = 100f;
                }
                else
                {
                    // no collision: the second player takes a normal turn. the reward is
                    // calculated after the first mover has finished, but still from the
                    // state captured before either agent moved
                    ms_step_reward   = calculateStepReward(ms_curr_state, ms_pac_man_action, 0);
                    action_return_ms = applyMsPacManAction(ms_pac_man_action);
                }
            }
            else // otherwise Ms Pac-Man moves first
            {
                ms_step_reward   = calculateStepReward(ms_curr_state, ms_pac_man_action, 0);
                action_return_ms = applyMsPacManAction(ms_pac_man_action);
                //Debug.Log("ms step reward: " + ms_step_reward);

                if (action_return_ms == 2) // Ms Pac-Man ate Mr Pac-Man
                {
                    mr_step_reward = -100f;
                    ms_step_reward = 100f;
                }
                else if (action_return_ms == 3) // Ms Pac-Man was eaten
                {
                    mr_step_reward = 100f;
                    ms_step_reward = -100f;
                }
                else // most of the time no collision occurs and Mr Pac-Man just moves
                {
                    mr_step_reward   = calculateStepReward(mr_curr_state, mr_pac_man_action, 1); // update Mr Pac-Man's score before moving
                    action_return_mr = applyMrPacManAction(mr_pac_man_action);
                }
            }
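
            // note: randomising the move order each step keeps the game fair over many
            // steps, since neither agent holds a permanent first-mover advantage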

            // update global score
            gridController.AddPoints(1, mr_step_reward);
            //gridController.mr_reward += mr_step_reward;
            gridController.AddPoints(0, ms_step_reward);
            //gridController.ms_reward += ms_step_reward;
            training_curr_step += 1;

            gridController.txtStep_MS.text = training_curr_step.ToString();
            gridController.txtStep_MR.text = training_curr_step.ToString();

            // get the new state of both players
            mr_new_state = gridController.grid.GetCell(gridController.MrPy, gridController.MrPx);
            ms_new_state = gridController.grid.GetCell(gridController.MsPy, gridController.MsPx);

            //Debug.Log("took actions");

            // LEARN

            // each agent receives its step reward and performs its learning update
            // (the two updates are independent, so they could run in parallel)
            if (isTraining) // only learn in training mode, not in testing mode
            {
                // both calls receive the opponent's action: a Q-learning agent ignores it,
                // while a minimax-Q agent uses it to index its joint-action Q-table
                mrPacMan.learn(mr_curr_state, mr_new_state, mr_step_reward, mr_pac_man_action, ms_pac_man_action);
                msPacMan.learn(ms_curr_state, ms_new_state, ms_step_reward, ms_pac_man_action, mr_pac_man_action);
            }
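
            // For reference, the updates these learn() calls are expected to implement
            // (an assumption based on the strategy types above; the actual code lives
            // in the agent classes):
            //   Q-learning: Q(s,a)   <- Q(s,a) + alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
            //   minimax-Q:  Q(s,a,o) <- (1-alpha) * Q(s,a,o) + alpha * (r + gamma * V(s'))
            // where V(s') is the minimax value over the joint-action Q-values at s'
            // (see the MinimaxValueSketch helper below for one way to compute it)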

            // per-match step cap: curr_step counts steps within the current match,
            // while training_curr_step (above) counts steps across the whole run
            if (curr_step >= max_steps)
            {
                //agentsRunning = false; // stop the current thread from running
                gridController.ResetTable(); // match hit the step cap: reset the game table
                curr_step = 0;
                //StartCoroutine(performAgentLoop()); // start a new loop of training on the new table
            }
            else
            {
                curr_step++;
            }

            // if all candies are gone, the match is over: the higher scorer wins
            if (gridController.grid.isNoCandies())
            {
                if (gridController.curr_match_score_mr > gridController.curr_match_score_ms)      // Mr Pac-Man scored more this match
                {
                    gridController.AddWin(1);                                                     // he gets a win
                }
                else if (gridController.curr_match_score_ms > gridController.curr_match_score_mr) // Ms Pac-Man scored more this match
                {
                    gridController.AddWin(0);
                    //gridController.games_won_ms += 1; // she gets a win
                }
                gridController.ResetTable(); // ties award no win; start a fresh match either way
            }

            // once the training step budget is exhausted, stop the loop
            if (training_curr_step >= max_training_steps)
            {
                //Debug.Log("QUIT TRAINING");
                agentsRunning = false;
                gridController.ResetTable();
            }
        }
    }
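
    // Illustrative only, not called by the loop above: one way to compute the
    // minimax value V(s') referenced in the learn() comment, assuming a
    // joint-action table q[myAction, opponentAction]. The real computation lives
    // inside the agent classes; this helper and its name are hypothetical, and it
    // restricts the policy to deterministic actions (full minimax-Q maximises
    // over mixed strategies, typically via a linear program).
    float MinimaxValueSketch(float[,] q)
    {
        float best = float.NegativeInfinity;
        for (int a = 0; a < q.GetLength(0); a++)      // my candidate action
        {
            float worst = float.PositiveInfinity;
            for (int o = 0; o < q.GetLength(1); o++)  // opponent's reply
            {
                worst = Mathf.Min(worst, q[a, o]);    // opponent picks my worst case
            }
            best = Mathf.Max(best, worst);            // I pick the best worst case
        }
        return best;
    }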