/// <summary>
/// fetch the current status document from the server. Once a first status
/// has been seen, the query string carries predicates asking the server to
/// hold the request until something changes or a timeout passes.
/// </summary>
/// <returns>
/// the parsed status document, or null if the request or the parse threw
/// </returns>
private async Task<XContainer> GetStatus()
{
    int enoughWorkers = Math.Max(1, (targetNumberOfWorkers * 3) / 4);
    int almostEveryWorker = targetNumberOfWorkers - 3;

    StringBuilder uri = new StringBuilder(serverAddress);
    uri.Append("status");

    if (knownWorkers.Count >= enoughWorkers)
    {
        // this runs on every call, so use the Try variant: completing the
        // source a second time must not throw
        reasonableReached.TrySetResult(true);
    }

    // a negative target means no status has been seen yet; in that case send
    // no predicates so the server answers immediately
    if (targetNumberOfWorkers >= 0)
    {
        // every waiting request wakes up when the epoch changes
        uri.AppendFormat("?epochGreater={0}", epoch);

        if (knownWorkers.Count < almostEveryWorker)
        {
            // still filling up: also wake when we get close to all of the
            // workers, or after a couple of seconds
            uri.AppendFormat("&thresholdGreater=Worker:{0}", almostEveryWorker - 1)
               .Append("&timeout=2000");
        }
        else
        {
            // nearly complete: also wake when any machine state changes, or
            // after a longer timeout
            uri.AppendFormat("&versionGreater={0}", version)
               .Append("&timeout=30000");
        }
    }

    IHttpRequest statusRequest = ClusterInterface.HttpClient.Create(uri.ToString());
    // if the query doesn't eventually return, something is wrong
    statusRequest.Timeout = 60 * 1000;

    try
    {
        using (IHttpResponse reply = await statusRequest.GetResponseAsync())
        using (Stream body = reply.GetResponseStream())
        using (System.Xml.XmlReader xml = System.Xml.XmlReader.Create(body))
        {
            return XDocument.Load(xml);
        }
    }
    catch (Exception e)
    {
        logger.Log("Failed getting status: " + e);
        return null;
    }
}
/// <summary>
/// build a Computer object describing one daemon and the queues it draws work from
/// </summary>
/// <param name="n">the unique name of the daemon</param>
/// <param name="host">the computer the daemon is running on</param>
/// <param name="rn">the rack the daemon is running on</param>
/// <param name="rack">the scheduling queue associated with the computer's rack</param>
/// <param name="cluster">the global scheduling queue associated with the cluster</param>
/// <param name="pServer">the address of the daemon's http server for process scheduling</param>
/// <param name="fServer">the address of the daemon's http server for file proxying</param>
/// <param name="directory">the daemon's local directory</param>
/// <param name="log">connection to the logging subsystem</param>
public Computer(string n, string host, string rn, ProcessQueue rack, ProcessQueue cluster,
                string pServer, string fServer, string directory, ClusterInterface.ILogger log)
{
    logger = log;

    // who and where the daemon is
    name = n;
    computerName = host;
    rackName = rn;
    localDirectory = directory;

    // the daemon's http endpoints
    processServer = pServer;
    fileServer = fServer;

    // a fresh queue private to this computer, plus the shared rack and cluster queues
    localQueue = new ProcessQueue();
    rackQueue = rack;
    clusterQueue = cluster;

    logger.Log("Created computer " + name + " on host " +
               String.Join(":", computerName, rackName, localDirectory, fileServer));

    // finishWaiter is the Task that CommandLoop blocks on; completing it (with
    // a null result) causes CommandLoop to exit. Once it completes, the child
    // waiters are signalled from a thread-pool task.
    finishWaiter = new TaskCompletionSource<Process>();
    childFinishWaiters = new HashSet<TaskCompletionSource<Process>>();
    finishWaiter.Task.ContinueWith(completed => Task.Run(() => SetChildFinishWaiters()));

    // completed when the Command Loop exits
    exited = new TaskCompletionSource<bool>();

    nextTask = 1;
}
/// <summary>
/// construct a scheduler with empty computer, locality and rack tables, a new
/// cluster-wide queue, and a placeholder Computer used for cancellation
/// </summary>
/// <param name="l">connection to the logging subsystem</param>
public LocalScheduler(ClusterInterface.ILogger l)
{
    logger = l;

    // lookup tables, all initially empty
    computers = new Dictionary<string, Computer>();
    localities = new Dictionary<string, List<Computer>>();
    racks = new Dictionary<string, Rack>();

    clusterQueue = new ProcessQueue();

    // a no-op task that has been created but not started
    flusher = new Task(() => { });

    clusterInterface = new PeloponneseInterface();

    // placeholder computer: it has no rack or cluster queue and no real
    // server addresses
    dummyCancelComputer = new Computer(
        "dummy for canceling",
        "nowhere",
        "no rack",
        null,
        null,
        "no server",
        "no server",
        "no directory",
        logger);

    l.Log("LocalScheduler created");
}
/// <summary>
/// reset this object's bookkeeping state, read the job configuration from the
/// environment, and register the process with the server by POSTing an empty
/// ProcessDetails element
/// </summary>
/// <param name="p">the scheduler stored as this object's parent</param>
/// <param name="l">connection to the logging subsystem</param>
/// <returns>
/// true if the registration request completed without throwing; false if a
/// required environment variable is missing or the request failed
/// </returns>
public bool Initialize(LocalScheduler p, ClusterInterface.ILogger l)
{
    parent = p;
    logger = l;

    epoch = 0;
    version = 0;
    targetNumberOfWorkers = -1;
    knownWorkers = new Dictionary<string, string>();
    reasonableReached = new TaskCompletionSource<bool>();
    shutdownTask = new TaskCompletionSource<XContainer>();
    waitingForComputer = new List<Task>();
    exited = new TaskCompletionSource<bool>();

    // read one required environment variable, logging if it is absent
    Func<string, string> readRequired = variable =>
    {
        string value = Environment.GetEnvironmentVariable(variable);
        if (value == null)
        {
            logger.Log("Can't find environment variable " + variable + ": exiting");
        }
        return value;
    };

    jobGuid = readRequired(Constants.EnvJobGuid);
    if (jobGuid == null)
    {
        return false;
    }

    serverAddress = readRequired(Constants.EnvManagerServerUri);
    if (serverAddress == null)
    {
        return false;
    }

    string groupName = readRequired(Constants.EnvProcessGroup);
    if (groupName == null)
    {
        return false;
    }

    string procIdentifier = readRequired(Constants.EnvProcessIdentifier);
    if (procIdentifier == null)
    {
        return false;
    }

    // the request body is an empty ProcessDetails element
    string status = new XElement("ProcessDetails").ToString();

    // escape each value: a '&', '=', '#', '+' or space coming from the
    // environment would otherwise corrupt the query string
    string registration = String.Format(
        "{0}register?guid={1}&group={2}&identifier={3}",
        serverAddress,
        Uri.EscapeDataString(jobGuid),
        Uri.EscapeDataString(groupName),
        Uri.EscapeDataString(procIdentifier));

    IHttpRequest request = ClusterInterface.HttpClient.Create(registration);
    // if this doesn't come back quickly, we'll get an exception and quit
    request.Timeout = 30 * 1000;
    request.Method = "POST";

    try
    {
        using (Stream upload = request.GetRequestStream())
        {
            using (StreamWriter sw = new StreamWriter(upload))
            {
                sw.Write(status);
            }
        }

        using (IHttpResponse response = request.GetResponse())
        {
            logger.Log("Server registration succeeded");
            return true;
        }
    }
    catch (NotHttpException e)
    {
        // if this failed, there's nothing much more we can do
        // NOTE(review): assumes e.Response is never null here -- confirm against NotHttpException
        logger.Log("Server registration failed message " + e.Message + " status " + e.Response.StatusCode + ": " + e.Response.StatusDescription);
        return false;
    }
    catch (Exception e)
    {
        // if this failed, there's nothing much more we can do
        logger.Log("Server registration failed message " + e.Message);
        return false;
    }
}
/// <summary>
/// offer a process to progressively wider scheduling queues: first the local
/// queues of computers it has affinity for, then the queues of racks, and
/// finally the cluster-wide queue. Stops as soon as a queue reports that the
/// process has been claimed.
/// </summary>
/// <param name="process">the process to schedule</param>
/// <param name="affinities">
/// placement preferences for the process. If any of them is a hard
/// constraint the process is never relaxed to the cluster queue.
/// </param>
/// <param name="callback">handed to the process via SetCallback before it is queued</param>
/// <remarks>
/// This is an async void method, so an exception thrown here cannot be
/// observed by the caller.
/// </remarks>
private async void ScheduleProcessInternal(Process process, List<ClusterInterface.Affinity> affinities, ClusterInterface.RunProcess callback)
{
    logger.Log("Scheduling process " + process.Id);

    process.SetCallback(callback);

    // capture both delay tasks up front so the delays are measured from now,
    // and so they refer to the flusher that is current at this point
    Task rackBlocker;
    Task clusterBlocker;
    lock (this)
    {
        rackBlocker = Task.WhenAny(flusher, Task.Delay(rackDelay));
        clusterBlocker = Task.WhenAny(flusher, Task.Delay(clusterDelay));
    }

    bool isHardConstraint = affinities.Any(a => a.isHardContraint);
    if (isHardConstraint)
    {
        // the constraint generator should have intersected the hard constraint into a single one
        Debug.Assert(affinities.Count == 1);
        logger.Log("Process " + process.Id + " has a hard constraint");
    }

    // materialize once: this is read again after the awaits below, and a
    // deferred query would re-enumerate the caller's list at that later time
    var allAffinities = affinities.SelectMany(a => a.affinities).Distinct().ToList();
    var computerAffinities = allAffinities.Where(a => a.level == ClusterInterface.AffinityResourceLevel.Host);

    bool addedAny = false;

    // get a snapshot of available computers. The per-locality lists are
    // copied too, because they are enumerated after the lock is released.
    Dictionary<string, List<Computer>> localitySnapshot = new Dictionary<string, List<Computer>>();
    lock (localities)
    {
        foreach (var c in localities)
        {
            localitySnapshot.Add(c.Key, new List<Computer>(c.Value));
        }
    }

    if (localitySnapshot.Count == 0)
    {
        await process.OnScheduled(null, -1, null, "No cluster computers available");
        return;
    }

    var racksUsed = new List<string>();

    foreach (var a in computerAffinities)
    {
        List<Computer> cl;
        // an empty list offers nowhere to queue and has no rack to remember,
        // so treat it like a missing locality
        if (localitySnapshot.TryGetValue(a.locality, out cl) && cl.Count > 0)
        {
            addedAny = true;
            logger.Log("Adding Process " + process.Id + " to queues for computers with locality " + a.locality);
            foreach (var c in cl)
            {
                logger.Log("Adding Process " + process.Id + " to queue for computer " + c.Name);
                if (c.LocalQueue.AddProcess(process))
                {
                    // this returns true if p has been matched to a computer, in which case we
                    // can stop adding it to queues
                    logger.Log("Process " + process.Id + " claimed by computer " + c.Name);
                    return;
                }
            }

            // remember the rack this computer was in, to include it for soft affinities below
            racksUsed.Add(cl[0].RackName);
        }
    }

    if (addedAny)
    {
        // hacky delay scheduling; wait until the upper level has finished adding processes in
        // the current stage, or some time has passed, before relaxing affinities if the process
        // had affinities for particular computers
        logger.Log("Process " + process.Id + " delay scheduling for rack");
        await rackBlocker;
    }

    // reset flag before adding to racks
    addedAny = false;

    // get a snapshot of available racks
    Dictionary<string, Rack> rackSnapshot = new Dictionary<string, Rack>();
    lock (racks)
    {
        foreach (var r in racks)
        {
            rackSnapshot.Add(r.Key, r.Value);
        }
    }

    var rackAffinities = allAffinities
        .Where(a => a.level == ClusterInterface.AffinityResourceLevel.Rack)
        .Select(a => a.locality)
        .Distinct();
    if (!isHardConstraint)
    {
        // soft affinities also try the racks of the computers tried above
        rackAffinities = rackAffinities.Concat(racksUsed).Distinct();
    }

    foreach (var a in rackAffinities)
    {
        Rack r;
        if (rackSnapshot.TryGetValue(a, out r))
        {
            addedAny = true;
            logger.Log("Adding Process " + process.Id + " to queue for rack " + a);
            if (r.queue.AddProcess(process))
            {
                // this returns true if p has been matched to a computer, in which case we
                // can stop adding it to queues
                logger.Log("Process " + process.Id + " claimed by rack " + a);
                return;
            }
        }
    }

    if (isHardConstraint)
    {
        // let the process know it won't get added to any more queues. This will signal the
        // upper layer if it didn't get added to any queues
        process.FinishedScheduling();
        return;
    }

    if (addedAny)
    {
        // hacky delay scheduling; wait until the upper level has finished adding processes in
        // the current stage, or some time has passed, before relaxing affinities if the process
        // had affinities for particular racks
        logger.Log("Process " + process.Id + " delay scheduling for cluster");
        await clusterBlocker;
    }

    logger.Log("Adding Process " + process.Id + " to queue for cluster");
    clusterQueue.AddProcess(process);

    // let the process know it won't get added to any more queues
    process.FinishedScheduling();
}
/// <summary>
/// shut down our local queue and unblock the finishWaiter, causing the
/// CommandLoop to exit. Completing the finishWaiter is safe to repeat: a
/// second call logs instead of throwing.
/// </summary>
public void ShutDown()
{
    logger.Log("Computer " + name + " stopping local queue");
    // stop the local queue accepting any more processes
    localQueue.ShutDown();

    logger.Log("Computer " + name + " starting finishWaiter");
    // SetResult throws InvalidOperationException if the source is already
    // completed; TrySetResult reports it through its return value instead
    if (!finishWaiter.TrySetResult(null))
    {
        logger.Log("Computer " + name + " finishWaiter was already completed");
    }
}
/// <summary>
/// reset this object's bookkeeping state, read the job configuration from the
/// environment, and register the process with the server by POSTing an empty
/// ProcessDetails element
/// </summary>
/// <param name="p">the scheduler stored as this object's parent</param>
/// <param name="l">connection to the logging subsystem</param>
/// <returns>
/// true if the registration request completed without throwing; false if a
/// required environment variable is missing or the request failed
/// </returns>
public bool Initialize(LocalScheduler p, ClusterInterface.ILogger l)
{
    parent = p;
    logger = l;

    epoch = 0;
    version = 0;
    targetNumberOfWorkers = -1;
    knownWorkers = new Dictionary<string, string>();
    reasonableReached = new TaskCompletionSource<bool>();
    shutdownTask = new TaskCompletionSource<XContainer>();
    waitingForComputer = new List<Task>();
    exited = new TaskCompletionSource<bool>();

    // read one required environment variable, logging if it is absent
    Func<string, string> readRequired = variable =>
    {
        string value = Environment.GetEnvironmentVariable(variable);
        if (value == null)
        {
            logger.Log("Can't find environment variable " + variable + ": exiting");
        }
        return value;
    };

    jobGuid = readRequired(Constants.EnvJobGuid);
    if (jobGuid == null)
    {
        return false;
    }

    serverAddress = readRequired(Constants.EnvManagerServerUri);
    if (serverAddress == null)
    {
        return false;
    }

    string groupName = readRequired(Constants.EnvProcessGroup);
    if (groupName == null)
    {
        return false;
    }

    string procIdentifier = readRequired(Constants.EnvProcessIdentifier);
    if (procIdentifier == null)
    {
        return false;
    }

    // the request body is an empty ProcessDetails element
    string status = new XElement("ProcessDetails").ToString();

    // escape each value: a '&', '=', '#', '+' or space coming from the
    // environment would otherwise corrupt the query string
    string registration = String.Format(
        "{0}register?guid={1}&group={2}&identifier={3}",
        serverAddress,
        Uri.EscapeDataString(jobGuid),
        Uri.EscapeDataString(groupName),
        Uri.EscapeDataString(procIdentifier));

    IHttpRequest request = ClusterInterface.HttpClient.Create(registration);
    // if this doesn't come back quickly, we'll get an exception and quit
    request.Timeout = 30 * 1000;
    request.Method = "POST";

    try
    {
        using (Stream upload = request.GetRequestStream())
        {
            using (StreamWriter sw = new StreamWriter(upload))
            {
                sw.Write(status);
            }
        }

        using (IHttpResponse response = request.GetResponse())
        {
            logger.Log("Server registration succeeded");
            return true;
        }
    }
    catch (NotHttpException e)
    {
        // if this failed, there's nothing much more we can do
        // NOTE(review): assumes e.Response is never null here -- confirm against NotHttpException
        logger.Log("Server registration failed message " + e.Message + " status " + e.Response.StatusCode + ": " + e.Response.StatusDescription);
        return false;
    }
    catch (Exception e)
    {
        // if this failed, there's nothing much more we can do
        logger.Log("Server registration failed message " + e.Message);
        return false;
    }
}