/*
 * Chooses the most greedy action (the action towards the highest Q value) based on the
 * current position and the surrounding locations. Ties are broken randomly.
 */
private int greedyAction(CytoscapeNode currentPos, int algorithm)
{
    int action = (int)(randomGenerator.NextDouble() * 4);
    double max = getQValue(currentPos, action, algorithm);
    double QValue;
    List <int> tiedActions = new List <int>();
    // Look at each direction
    for (int i = 0; i < 4; i++)
    {
        QValue = getQValue(currentPos, i, algorithm);
        if (QValue > max)
        {
            max = QValue;
            action = i;
            // Have to reset the tiedActions list once a new max has been found
            tiedActions = new List <int>();
            tiedActions.Add(i);
        }
        else if (QValue == max)
        {
            tiedActions.Add(i);
        }
    }
    int tieBreakerIndex = 0;
    if (tiedActions.Count > 1)
    {
        tieBreakerIndex = (int)(randomGenerator.NextDouble() * tiedActions.Count);
        action = tiedActions[tieBreakerIndex];
    }
    return(action);
}
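/*
 * Worked example of the tie-break above (illustrative Q values, not taken from
 * a real run): if the four Q values are [1.0, 3.0, 3.0, 0.5], actions 1 and 2
 * tie at 3.0, tiedActions ends up holding {1, 2}, and one of the two is chosen
 * uniformly at random.
 */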
/*
 * sPrime and aPrime here are the next state-action pair (s', a').
 * Q(s, a) = Q(s, a) + alpha*[r + gamma*Q(s', a') - Q(s, a)]
 */
private void sarsaBellmanCalculation(CytoscapeNode s, int a, CytoscapeNode sPrime, int aPrime, int reward)
{
    Tuple <CytoscapeNode, int> stateActionPair = new Tuple <CytoscapeNode, int>(s, a);
    Tuple <CytoscapeNode, int> stateActionPairPrime = new Tuple <CytoscapeNode, int>(sPrime, aPrime);
    SARSAQ[stateActionPair] = getQValue(stateActionPair, SARSA_EPISODE)
        + ALPHA * (reward + GAMMA * getQValue(stateActionPairPrime, SARSA_EPISODE) - getQValue(stateActionPair, SARSA_EPISODE));
}
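/*
 * Worked example of the SARSA update above (illustrative values only — ALPHA
 * and GAMMA are assumed here to be 0.1 and 0.9; the real constants may differ):
 *
 *   Q(s, a) = 2.0, Q(s', a') = 5.0, r = -1
 *   target  = r + GAMMA * Q(s', a')            = -1 + 0.9 * 5.0   = 3.5
 *   Q(s, a) = Q(s, a) + ALPHA * (target - Q(s, a))
 *           = 2.0 + 0.1 * (3.5 - 2.0)          = 2.15
 */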
// The current reward model is 0 if arriving in the goal state, else -1
private static int rewardForState(CytoscapeNode node, int goalID)
{
    if (node.id == goalID)
    {
        return(0);
    }
    return(-1);
}
// Copy constructor. Note that connections and path are shared by reference, not deep-copied.
public CytoscapeNode(CytoscapeNode copy)
{
    this.id = copy.id;
    this.name = copy.name;
    this.heuristic = copy.heuristic;
    this.connections = copy.connections;
    this.path = copy.path;
}
// Cytoscape requires a source and a target to be defined when rendering the network on the UI.
// However, AStar treats connections as undirected, so this makes sure
// we aren't accidentally looking at the wrong node.
public int undirectedTarget(CytoscapeNode desiredSource)
{
    if (source == desiredSource.id)
    {
        return(target);
    }
    else
    {
        return(source);
    }
}
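/*
 * Example: for a connection rendered with source = 3 and target = 5, calling
 * undirectedTarget on node 3 returns 5, while calling it on node 5 returns 3,
 * so traversal works the same in both directions.
 */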
/*
 * Off-policy update: the target bootstraps from the best action available in the
 * next state s', regardless of the action the agent actually takes next.
 * Q(s, a) = Q(s, a) + alpha*[r + gamma*max_a' Q(s', a') - Q(s, a)]
 */
private void qLearningBellmanCalculation(CytoscapeNode s, int a, int reward)
{
    Tuple <CytoscapeNode, int> stateActionPair = new Tuple <CytoscapeNode, int>(s, a);
    CytoscapeNode sPrime = getNewState(s, a);
    // The maximizing action must be chosen from the next state s', not from s
    // (argMax, defined below, picks the greedy action by immediate reward)
    int aPrime = argMax(sPrime);
    Tuple <CytoscapeNode, int> stateActionPairPrime = new Tuple <CytoscapeNode, int>(sPrime, aPrime);
    double QsPrimeaPrime = getQValue(stateActionPairPrime, QLEARNING_EPISODE);
    double QsA = getQValue(stateActionPair, QLEARNING_EPISODE);
    QLearningQ[stateActionPair] = QsA + ALPHA * (reward + GAMMA * QsPrimeaPrime - QsA);
}
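/*
 * Using the same illustrative numbers as the SARSA example (ALPHA = 0.1 and
 * GAMMA = 0.9 assumed), the only difference is the target: Q-learning is
 * off-policy, so if the greedy next action is worth Q(s', a') = 5.0 but the
 * agent actually explores with an action worth 1.0, Q-learning still uses 5.0
 * in its target, while SARSA would use 1.0.
 */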
// Helper to keep track of the path being looked at every iteration
private static void trackAnimationFrame(List <AnimationFrame> frames, CytoscapeNode current)
{
    AStarAnimationFrame frame = new AStarAnimationFrame();
    frame.frame = new List <AStarAnimationNode>();
    AStarAnimationNode tempNode;
    foreach (CytoscapeNode node in current.path)
    {
        tempNode = new AStarAnimationNode(node.id);
        frame.frame.Add(tempNode);
    }
    frames.Add(frame);
}
// Reward model: -100 for falling in a hole, +100 for reaching the goal, -1 per step otherwise
private int getStateReward(CytoscapeNode newNode)
{
    if (newNode.cellType == DPCellType.Hole)
    {
        return(-100);
    }
    else if (newNode == goalNode)
    {
        return(100);
    }
    else
    {
        return(-1);
    }
}
/*
 * Chooses an action that is random with probability epsilon; otherwise the action is chosen greedily.
 */
private int epsilonGreedyAction(CytoscapeNode currentPos, int algorithm)
{
    double prob = randomGenerator.NextDouble();
    int temp;
    if (prob < epsilon)
    {
        temp = randomAction();
    }
    else
    {
        temp = greedyAction(currentPos, algorithm);
    }
    return(temp);
}
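/*
 * A minimal standalone sketch (hypothetical demo method, not called by the
 * simulation) showing the explore/exploit split this produces: with
 * epsilon = 0.9, roughly 90% of selections are random and 10% are greedy.
 */
private static void epsilonGreedyDemo()
{
    Random rng = new Random();
    double demoEpsilon = 0.9;
    int explored = 0, trials = 10000;
    for (int i = 0; i < trials; i++)
    {
        if (rng.NextDouble() < demoEpsilon)
        {
            explored++; // this branch would call randomAction()
        }
        // the other branch would call greedyAction(...)
    }
    // Expected output: roughly 9000 of 10000 trials explored
    Console.WriteLine("Explored " + explored + " of " + trials + " trials");
}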
/* Determine the new state reached when the given action is applied to the current node.
 * Returns the same node if the action would move into a wall.
 */
private CytoscapeNode getNewState(CytoscapeNode currentNode, int action)
{
    // Walls are ignored and the goal state is absorbing
    if (currentNode.cellType == DPCellType.Wall || currentNode.cellType == DPCellType.Goal || action == DIDNTMOVE)
    {
        return(currentNode);
    }
    // First determine what direction the action is. If the new state is a wall then
    // return the current node, otherwise return the new state
    Tuple <int, int> newCoords = new Tuple <int, int>(0, 0);
    switch (action)
    {
        case LEFT:
            newCoords = new Tuple <int, int>(currentNode.x - 1, currentNode.y);
            break;
        case RIGHT:
            newCoords = new Tuple <int, int>(currentNode.x + 1, currentNode.y);
            break;
        case UP:
            newCoords = new Tuple <int, int>(currentNode.x, currentNode.y - 1);
            break;
        case DOWN:
            newCoords = new Tuple <int, int>(currentNode.x, currentNode.y + 1);
            break;
        default:
            throw new Exception("Attempting to take an invalid action.");
    }
    if (nodeMap[newCoords].cellType == DPCellType.Wall)
    {
        return(currentNode);
    }
    else
    {
        return(nodeMap[newCoords]);
    }
}
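/*
 * Example: taking LEFT from the node at (4, 2) looks up the node at (3, 2);
 * if that node is a Wall, the agent stays at (4, 2), otherwise it moves there.
 * (Note that UP decreases y, matching the screen-coordinate convention above.)
 */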
private int getOptimalActionForState(CytoscapeNode node, int algorithm)
{
    double max = Double.MinValue;
    int action = 0;
    double QValue;
    // 4 possible actions
    for (int i = 0; i < 4; i++)
    {
        QValue = getQValue(node, i, algorithm);
        // If the QValue is 0, that means the state-action pair was never actually taken
        if (QValue > max && QValue != 0)
        {
            max = QValue;
            action = i;
        }
    }
    // -1 signals that no action from this state has been taken yet
    return(max == Double.MinValue ? -1 : action);
}
// If the cell is slippery (Ice), there is an 80% chance of staying in the same cell;
// otherwise there is a 100% chance of moving in the direction of the policy.
private static double probabilityOfTransition(CytoscapeNode node, Dictionary <Tuple <int, int>, CytoscapeNode> nodeMap, int action, int actionAccordingToPolicy)
{
    double probability = 1.0;
    if (node.cellType == DPCellType.Ice && action == DIDNTMOVE)
    {
        probability = SLIPPING_PROB;
    }
    else if (node.cellType == DPCellType.Ice && action == actionAccordingToPolicy)
    {
        probability = 1.0 - SLIPPING_PROB;
    }
    else if (action != actionAccordingToPolicy)
    {
        probability = 0.0;
    }
    return(probability);
}
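/*
 * Example distribution implied by the rule above for an Ice cell whose policy
 * action is RIGHT (taking the 80% figure from the comment, i.e. assuming
 * SLIPPING_PROB = 0.8):
 *   P(DIDNTMOVE) = 0.8, P(RIGHT) = 0.2, P(LEFT) = P(UP) = P(DOWN) = 0.0
 * For a non-Ice cell, the policy action has probability 1.0 and everything
 * else 0.0.
 */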
/*
 * Returns the action that provides the highest immediate reward from the given position
 */
private int argMax(CytoscapeNode currentPosition)
{
    int max = Int32.MinValue;
    CytoscapeNode possiblePosition;
    int reward, action = 0;
    // 4 possible actions
    for (int i = 0; i < 4; i++)
    {
        possiblePosition = getNewState(currentPosition, i);
        reward = getStateReward(possiblePosition);
        if (reward > max)
        {
            max = reward;
            action = i;
        }
    }
    return(action);
}
/*
 * Currently using two different classes to perform simulations. One is prefaced with
 * Cytoscape, which is passed in via the UI's ajax call and used in the simulation.
 * The other is prefaced with Animation, which is a simplified form that is
 * passed back to the UI to animate. Besides being simplified, the Animation form
 * also contains no circular references, unlike the Cytoscape versions:
 * C# has issues serializing objects with circular references, whereas Javascript does not.
 */
public static Animation runSimulation(int startID, int goalID, CytoscapeParams cyParams)
{
    Animation results = new Animation();
    AStarSpecificAnimation aStarSpecific = new AStarSpecificAnimation();
    aStarSpecific.frontierOverTime = new List <List <AStarAnimationNode> >();
    List <AnimationFrame> frames = new List <AnimationFrame>();
    bool goalFound = false;
    CytoscapeMap map = new CytoscapeMap(initializeInternalNodes(cyParams.nodes));
    IntervalHeap <CytoscapeNode> frontier = new IntervalHeap <CytoscapeNode>();
    CytoscapeNode current = map.getNode(startID);
    while (!goalFound)
    {
        // Add new frontier to priority queue
        addToFrontier(map, frontier, current);
        // Store path every iteration for animation
        trackAnimationFrame(frames, current);
        // Store the frontier every iteration for animation
        storeFrontierOverTime(aStarSpecific, frontier);
        // Get the next node to expand
        current = frontier.DeleteMax();
        // When done we record the last frame's information and break
        if (current.id == goalID)
        {
            goalFound = true;
            trackAnimationFrame(frames, current);
            storeFrontierOverTime(aStarSpecific, frontier);
        }
    }
    results.frames = frames;
    results.simulationSpecific = aStarSpecific;
    return(results);
}
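/*
 * A hypothetical caller sketch (how CytoscapeParams is populated by the UI's
 * ajax layer is not shown in this file; only its nodes field is used above):
 *
 *   CytoscapeParams cyParams = ...;  // deserialized from the ajax request
 *   Animation anim = runSimulation(startID, goalID, cyParams);
 *   // anim.frames then holds one path snapshot per expansion, and
 *   // anim.simulationSpecific holds the frontier at each step.
 */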
// Lazily initializes Q(s, a) to 0.0 the first time a state-action pair is queried
private double getQValue(CytoscapeNode s, int a, int algorithm)
{
    Tuple <CytoscapeNode, int> stateActionPair = new Tuple <CytoscapeNode, int>(s, a);
    if (algorithm == QLEARNING_EPISODE)
    {
        if (!QLearningQ.ContainsKey(stateActionPair))
        {
            QLearningQ.Add(stateActionPair, 0.0);
        }
        return(QLearningQ[stateActionPair]);
    }
    else
    {
        if (!SARSAQ.ContainsKey(stateActionPair))
        {
            SARSAQ.Add(stateActionPair, 0.0);
        }
        return(SARSAQ[stateActionPair]);
    }
}
private static void addToFrontier(CytoscapeMap map, IntervalHeap <CytoscapeNode> frontier, CytoscapeNode node)
{
    CytoscapeNode tempNode;
    foreach (CytoscapeConnection connection in node.connections)
    {
        int undirectedTargetID = connection.undirectedTarget(node);
        tempNode = map.getNode(undirectedTargetID);
        // Discard cyclic paths
        if (!node.hasVisitedNode(undirectedTargetID))
        {
            // Keep track of the path taken
            if (node.path == null || !node.path.Any())
            {
                node.path = new List <CytoscapeNode>();
                node.path.Add(node);
            }
            // Make sure to duplicate the path instead of pointing at node's path field
            tempNode.path = new List <CytoscapeNode>(node.path);
            tempNode.path.Add(tempNode);
            // f is the heuristic plus the distance traveled so far
            tempNode.distance = node.distance + connection.distance;
            tempNode.f = tempNode.heuristic + tempNode.distance;
            frontier.Add(tempNode);
        }
    }
}
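/*
 * Worked example of the f computation above: if node.distance = 4 (cost so
 * far), connection.distance = 2, and the neighbour's heuristic = 3, then the
 * neighbour is queued with f = g + h = (4 + 2) + 3 = 9.
 */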
public Animation runSimulation(CytoscapeParams cyParams)
{
    Animation results = new Animation();
    randomGenerator = new Random();
    int numEpisodes = cyParams.nodes.Count * 3;
    timeHorizon = cyParams.nodes.Count * 3;
    // Set up maps from coordinates or ID to the nodes
    nodeMap = new Dictionary <Tuple <int, int>, CytoscapeNode>();
    nodeIDMap = new Dictionary <int, CytoscapeNode>();
    for (int i = 0; i < cyParams.nodes.Count; i++)
    {
        Tuple <int, int> coords = new Tuple <int, int>(cyParams.nodes[i].x, cyParams.nodes[i].y);
        nodeMap.Add(coords, cyParams.nodes[i]);
        nodeIDMap.Add(i, cyParams.nodes[i]);
    }
    epsilon = 0.9;
    originalEpsilon = epsilon;
    startID = cyParams.startID;
    goalID = cyParams.goalID;
    startNode = nodeIDMap[startID];
    goalNode = nodeIDMap[goalID];
    Tuple <List <string>, int> QLearningActionRewardPair;
    Tuple <List <string>, int> SARSAActionRewardPair;
    results.frames = new List <AnimationFrame>();
    for (int episodeNumber = 0; episodeNumber < numEpisodes; episodeNumber++)
    {
        // Run the episode for each algorithm
        QLearningActionRewardPair = runEpisode(QLEARNING_EPISODE);
        SARSAActionRewardPair = runEpisode(SARSA_EPISODE);
        // Epsilon decreases every 10 episodes
        if (episodeNumber >= 10 && episodeNumber % 10 == 0)
        {
            epsilon = originalEpsilon / (episodeNumber / 10);
            if (epsilon <= 0.009)
            {
                epsilon = 0.0;
            }
        }
        // Each animation frame holds the current QLearning and SARSA policies,
        // plus the states visited during this episode by each algorithm
        RLAnimationFrame frame = new RLAnimationFrame();
        frame.QLearningPolicy = collectCurrentOptimalPolicy(cyParams.nodes, QLEARNING_EPISODE);
        frame.SARSAPolicy = collectCurrentOptimalPolicy(cyParams.nodes, SARSA_EPISODE);
        frame.QLearningEpisodeStates = QLearningActionRewardPair.Item1;
        frame.SARSAEpisodeStates = SARSAActionRewardPair.Item1;
        results.frames.Add(frame);
    }
    return(results);
}
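/*
 * Worked example of the decay schedule above, for originalEpsilon = 0.9:
 *   episodes  0-19: epsilon = 0.9      (episode 10 recomputes 0.9 / 1)
 *   episodes 20-29: epsilon = 0.45     (0.9 / 2)
 *   episodes 30-39: epsilon = 0.3      (0.9 / 3)
 *   ...
 *   epsilon snaps to 0.0 once 0.9 / (episodeNumber / 10) falls to 0.009 or
 *   below (i.e. from episode 1000 onward, if the run lasts that long).
 */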