/// <summary>
/// Assigns the pending experimental units to the free herd agents, sends the resulting jobs and
/// monitors them. Whenever a job finishes, its failed units are enqueued again and its herd agent is
/// returned to the free list so that new jobs can be assigned, until all the work has been sent and
/// finished or cancellation is requested.
/// </summary>
/// <param name="experiments">Pending experimental units. Failed units are appended again and, if
/// the user cancels, the units of the jobs still listed as assigned are added back</param>
/// <param name="freeHerdAgents">Herd agents available to run jobs. The agent of every finished job
/// is added back to this list</param>
/// <param name="dispatcher">Callbacks used to notify job assignment/completion and to log messages</param>
/// <param name="cancellationTokenSource">Source polled to detect a cancellation request</param>
/// <param name="jobDispatcherOptions">Optional options forwarded to AssignExperiments</param>
/// <returns>An awaitable task that returns the number of experimental units counted as run</returns>
public static async Task<int> RunExperimentsAsync(List<ExperimentalUnit> experiments, List<HerdAgentInfo> freeHerdAgents,
    Monitoring.MsgDispatcher dispatcher, CancellationTokenSource cancellationTokenSource,
    JobDispatcherOptions jobDispatcherOptions = null)
{
    List<Job> assignedJobs = new List<Job>();
    List<Task<Job>> monitoredJobTasks = new List<Task<Job>>();
    int numExperimentalUnitsRun = 0;

    // Calculate run-time requirements
    foreach (ExperimentalUnit experimentalUnit in experiments)
    {
        experimentalUnit.RequestRuntimeRequirements();
    }

    // Assign experiments to free agents
    AssignExperiments(ref experiments, ref freeHerdAgents, ref assignedJobs, jobDispatcherOptions);

    if (assignedJobs.Count == 0)
    {
        return 0;
    }

    try
    {
        while ((assignedJobs.Count > 0 || monitoredJobTasks.Count > 0 || experiments.Count > 0)
            && !cancellationTokenSource.IsCancellationRequested)
        {
            // Create view-models for the jobs and execute them remotely
            foreach (Job job in assignedJobs)
            {
                dispatcher.JobAssigned?.Invoke(job);
                monitoredJobTasks.Add(job.SendJobAndMonitor(dispatcher));
            }

            // All pending experiments sent? Then we only need to wait for the running jobs.
            // NOTE(review): failed units of these last jobs are not enqueued again here.
            if (experiments.Count == 0)
            {
                // Awaited instead of blocked on with Wait(): blocking the calling thread can
                // deadlock when the monitored tasks need that same thread to resume
                Job[] finishedJobs = await Task.WhenAll(monitoredJobTasks);
                foreach (Job job in finishedJobs)
                {
                    dispatcher.JobFinished?.Invoke(job);
                }
                dispatcher.Log?.Invoke("All the experiments have finished");
                // The units of these jobs are counted in the finally block below
                break;
            }

            // Wait for the first agent to finish and give it something to do
            Task<Job> finishedTask = await Task.WhenAny(monitoredJobTasks);
            Job finishedJob = await finishedTask;
            dispatcher.Log?.Invoke("Job finished: " + finishedJob.ToString());

            // A job finished
            monitoredJobTasks.Remove(finishedTask);

            if (!cancellationTokenSource.IsCancellationRequested)
            {
                numExperimentalUnitsRun += finishedJob.ExperimentalUnits.Count;
                dispatcher.JobFinished?.Invoke(finishedJob);
            }

            if (finishedJob.FailedExperimentalUnits.Count > 0)
            {
                experiments.AddRange(finishedJob.FailedExperimentalUnits);
                dispatcher.Log?.Invoke(finishedJob.FailedExperimentalUnits.Count + " failed experiments enqueued again for further trials");
            }

            // Add the herd agent to the free agent list
            if (!freeHerdAgents.Contains(finishedJob.HerdAgent))
            {
                freeHerdAgents.Add(finishedJob.HerdAgent);
            }

            // Assign experiments to free agents
            if (!cancellationTokenSource.IsCancellationRequested)
            {
                AssignExperiments(ref experiments, ref freeHerdAgents, ref assignedJobs, jobDispatcherOptions);
            }
        }
    }
    catch (Exception ex)
    {
        dispatcher.Log?.Invoke("Exception in runExperimentQueueRemotely()");
        // ToString() includes the exception type and message besides the stack trace
        dispatcher.Log?.Invoke(ex.ToString());
    }
    finally
    {
        if (cancellationTokenSource.IsCancellationRequested)
        {
            // The user cancelled, need to add unfinished experimental units to the pending list
            foreach (Job job in assignedJobs)
            {
                experiments.AddRange(job.ExperimentalUnits);
            }
        }
        else
        {
            // Count the units of the jobs still in the monitored list, leaving out the failed ones.
            // The tasks are awaited (not read through .Result) so that a job that is still running
            // after an exception above does not block the calling thread
            foreach (Task<Job> monitoredTask in monitoredJobTasks)
            {
                Job monitoredJob = await monitoredTask;
                numExperimentalUnitsRun += monitoredJob.ExperimentalUnits.Count - monitoredJob.FailedExperimentalUnits.Count;
            }
        }
    }

    return numExperimentalUnitsRun;
}
/// <summary>
/// Async method that sends the job to the herd agent and monitors its evolution, using the
/// callback functions in dispatcher to notify the client of any event
/// </summary>
/// <param name="dispatcher">The event dispatching data</param>
/// <returns>An awaitable task that returns the finished job</returns>
public async Task<Job> SendJobAndMonitor(Monitoring.MsgDispatcher dispatcher)
{
    Shepherd shepherd = new Shepherd();
    shepherd.SetLogMessageHandler(dispatcher.Log);

    try
    {
        PrepareForExecution(); // compute inputs/outputs...

        // Let the dispatcher know which experimental units were created
        foreach (ExperimentalUnit expUnit in ExperimentalUnits)
        {
            dispatcher.ExperimentalUnitLaunched?.Invoke(this, expUnit);
        }

        bool bConnected = shepherd.ConnectToHerdAgent(HerdAgent.ipAddress);
        if (!bConnected)
        {
            // SOLVE THIS: failed experimental units should be dealt by the caller
            foreach (ExperimentalUnit exp in ExperimentalUnits)
            {
                FailedExperimentalUnits.Add(exp);
            }
            dispatcher.AllStatesChanged?.Invoke(this, Monitoring.State.ERROR);
            dispatcher.Log?.Invoke("Failed to connect to herd agent " + HerdAgent.ipAddress);
            return this;
        }

        dispatcher.Log?.Invoke("Sending job to herd agent " + HerdAgent.ipAddress);
        dispatcher.AllStatesChanged?.Invoke(this, Monitoring.State.SENDING);

        shepherd.SendJobQuery(this, dispatcher.CancelToken);
        dispatcher.Log?.Invoke("Job sent to herd agent " + HerdAgent.ipAddress);
        // Null-conditional invocation like every other callback: with no subscriber, a direct
        // call would throw and the catch below would flag the whole job as failed
        dispatcher.AllStatesChanged?.Invoke(this, Monitoring.State.RUNNING);

        dispatcher.Log?.Invoke("Monitoring remote job run by herd agent " + HerdAgent.ipAddress);

        // Monitor the remote job
        while (true)
        {
            await shepherd.ReadAsync(dispatcher.CancelToken);
            dispatcher.CancelToken.ThrowIfCancellationRequested();

            string xmlItem = shepherd.m_xmlStream.processNextXMLItem();

            while (xmlItem != "")
            {
                // The order of these four calls matters: the first tag identifies the sender and
                // the first call to getLastXMLItemContent resets lastXMLItem, so that the second
                // tag/content pair reads the message itself
                string experimentId = shepherd.m_xmlStream.getLastXMLItemTag();
                shepherd.m_xmlStream.getLastXMLItemContent();
                string messageId = shepherd.m_xmlStream.getLastXMLItemTag();
                string messageContent = shepherd.m_xmlStream.getLastXMLItemContent();

                if (experimentId == XMLStream.m_defaultMessageType)
                {
                    // The message comes from the herd agent, must be sending results
                    dispatcher.Log?.Invoke("Receiving job results");
                    dispatcher.AllStatesChanged?.Invoke(this, Monitoring.State.RECEIVING);

                    await shepherd.ReceiveJobResult(dispatcher.CancelToken);

                    dispatcher.AllStatesChanged?.Invoke(this, Monitoring.State.FINISHED);
                    m_bFinished = true;
                    dispatcher.Log?.Invoke("Job results received");
                    return this;
                }

                // The message comes from an experimental unit, the dispatcher will deal with it
                dispatcher.MessageReceived?.Invoke(this, experimentId, messageId, messageContent);

                xmlItem = shepherd.m_xmlStream.processNextXMLItem();
            }
        }
    }
    catch (OperationCanceledException)
    {
        // Quit remote jobs
        dispatcher.Log?.Invoke("Cancellation requested by user");
        shepherd.WriteMessage(Shepherd.m_quitMessage, true);
        // Wait for the ack from the client; a non-cancellable token is used because the
        // dispatcher's token has already been cancelled at this point
        await shepherd.ReadAsync(CancellationToken.None);
        m_bCancelled = true;
    }
    catch (Exception ex)
    {
        dispatcher.Log?.Invoke("Unhandled exception in Badger.sendJobAndMonitor(). Agent " + HerdAgent.ipAddress);
        dispatcher.Log?.Invoke(ex.ToString());
        // Flag every experimental unit in the job as failed
        FailedExperimentalUnits.Clear();
        FailedExperimentalUnits.AddRange(ExperimentalUnits);
    }
    finally
    {
        // Runs on every exit path of the try block, including the early returns
        dispatcher.Log?.Invoke("Disconnected from herd agent " + HerdAgent.ipAddress);
        shepherd.Disconnect();
    }

    return this;
}