/// <summary>
/// Service entry point for a slave heartbeat. Authenticates the caller for the slave role,
/// hands the heartbeat to <c>HeartbeatManager.ProcessHeartbeat</c> and returns the resulting messages.
/// </summary>
/// <param name="heartbeat">The heartbeat data that is passed on to the heartbeat manager.</param>
/// <returns>
/// The messages produced by the heartbeat manager; an empty list if <c>ProcessHeartbeat</c> threw
/// (the exception is logged and not rethrown).
/// </returns>
public List<MessageContainer> Heartbeat(DT.Heartbeat heartbeat) {
  RoleVerifier.AuthenticateForAnyRole(HiveRoles.Slave);

  var messages = new List<MessageContainer>();
  try {
    // the PerformanceLogger instance lives exactly as long as the ProcessHeartbeat call
    using (new PerformanceLogger("ProcessHeartbeat")) {
      messages = HeartbeatManager.ProcessHeartbeat(heartbeat);
    }
  }
  catch (Exception ex) {
    // best effort: a failing heartbeat is logged and answered with whatever is in the list so far
    var logger = DA.LogFactory.GetLogger(this.GetType().Namespace);
    logger.Log(string.Format("Exception processing Heartbeat: {0}", ex));
  }

  // runs regardless of whether processing succeeded, but only if enabled in the settings
  var triggerEventManager = HeuristicLab.Services.Hive.Properties.Settings.Default.TriggerEventManagerInHeartbeat;
  if (triggerEventManager) {
    TriggerEventManager(false);
  }

  return messages;
}
/// <summary>
/// Processes a single heartbeat of a slave and collects the messages that are sent back to it.
/// This method will be called every time a slave sends a heartbeat (-> very often; concurrency is important!)
/// </summary>
/// <remarks>
/// Steps, in order:
/// 1. Look up the slave by <c>heartbeat.SlaveId</c>; if it is not found, only a SayHello message is returned.
/// 2. Add NewHBInterval if the heartbeat interval differs from the stored one, and ShutdownComputer
///    if <c>SlaveHasToShutdownComputer</c> reports so.
/// 3. Copy the reported resource figures to the slave record and submit the changes.
/// 4. Process the reported task progress via <c>UpdateTasks</c>.
/// 5. If the slave asks for work, is allowed to calculate and has free cores, try to assign one waiting
///    task while holding the mutex named <c>MutexName</c> (waiting at most <c>SchedulingPatience</c>).
/// Exceptions thrown inside the scheduling section are logged and not rethrown.
/// </remarks>
/// <param name="heartbeat">The heartbeat received from the slave.</param>
/// <returns>a list of actions the slave should do</returns>
public List<MessageContainer> ProcessHeartbeat(Heartbeat heartbeat) {
  List<MessageContainer> actions = new List<MessageContainer>();
  var pm = PersistenceManager;
  var slaveDao = pm.SlaveDao;
  var taskDao = pm.TaskDao;

  var slave = pm.UseTransaction(() => slaveDao.GetById(heartbeat.SlaveId));
  if (slave == null) {
    // no slave record for this id: nothing else can be processed
    actions.Add(new MessageContainer(MessageContainer.MessageType.SayHello));
    return actions;
  }

  if (heartbeat.HbInterval != slave.HbInterval) {
    actions.Add(new MessageContainer(MessageContainer.MessageType.NewHBInterval));
  }
  if (slaveDao.SlaveHasToShutdownComputer(slave.ResourceId)) {
    actions.Add(new MessageContainer(MessageContainer.MessageType.ShutdownComputer));
  }

  // update slave data
  slave.FreeCores = heartbeat.FreeCores;
  slave.FreeMemory = heartbeat.FreeMemory;
  slave.CpuUtilization = heartbeat.CpuUtilization;
  slave.SlaveState = (heartbeat.JobProgress != null && heartbeat.JobProgress.Count > 0)
    ? DA.SlaveState.Calculating
    : DA.SlaveState.Idle;
  slave.LastHeartbeat = DateTime.Now;
  pm.UseTransaction(() => {
    slave.IsAllowedToCalculate = slaveDao.SlaveIsAllowedToCalculate(slave.ResourceId);
    pm.SubmitChanges();
  });

  // update task data
  actions.AddRange(UpdateTasks(pm, heartbeat, slave.IsAllowedToCalculate));

  // assign new task
  if (heartbeat.AssignJob && slave.IsAllowedToCalculate && heartbeat.FreeCores > 0) {
    bool mutexAquired = false;
    // Mutex wraps an OS wait handle; dispose it deterministically instead of leaving one handle
    // per heartbeat to the finalizer.
    using (var mutex = new Mutex(false, MutexName)) {
      try {
        mutexAquired = mutex.WaitOne(Properties.Settings.Default.SchedulingPatience);
        if (mutexAquired) {
          var waitingTasks = pm.UseTransaction(() => taskDao.GetWaitingTasks(slave)
            .Select(x => new TaskInfoForScheduler {
              TaskId = x.TaskId,
              JobId = x.JobId,
              Priority = x.Priority
            })
            .ToList()
          );
          var availableTasks = TaskScheduler.Schedule(waitingTasks).ToArray();
          if (availableTasks.Any()) {
            var task = availableTasks.First();
            AssignTask(pm, slave, task.TaskId);
            actions.Add(new MessageContainer(MessageContainer.MessageType.CalculateTask, task.TaskId));
          }
        } else {
          LogFactory.GetLogger(this.GetType().Namespace).Log("HeartbeatManager: The mutex used for scheduling could not be aquired.");
        }
      }
      catch (AbandonedMutexException) {
        // WaitOne throws this when a previous owner exited without releasing the mutex. The calling
        // thread owns the mutex at this point, so it has to be released in the finally block;
        // otherwise it stays locked for every other heartbeat. No task is assigned for this heartbeat.
        mutexAquired = true;
        LogFactory.GetLogger(this.GetType().Namespace).Log("HeartbeatManager: The mutex used for scheduling has been abandoned.");
      }
      catch (Exception ex) {
        LogFactory.GetLogger(this.GetType().Namespace).Log(string.Format("HeartbeatManager threw an exception in ProcessHeartbeat: {0}", ex));
      }
      finally {
        if (mutexAquired) mutex.ReleaseMutex();
      }
    }
  }

  return actions;
}
/// <summary>
/// Handles the task progress entries reported in a heartbeat and collects the messages for the slave.
/// For every reported task it is checked whether the task exists and whether this slave is the one
/// that is supposed to calculate it; only then the execution time of the task is updated.
/// </summary>
/// <param name="pm">Persistence manager providing the DAOs and the transaction scope.</param>
/// <param name="heartbeat">The heartbeat whose <c>JobProgress</c> entries are processed.</param>
/// <param name="isAllowedToCalculate">Whether the sending slave is currently allowed to calculate.</param>
/// <returns>
/// The messages resulting from the reported progress: empty if nothing was reported, a single PauseAll
/// if the slave is not allowed to calculate, otherwise abort/pause/stop messages per task as needed.
/// </returns>
private IEnumerable<MessageContainer> UpdateTasks(IPersistenceManager pm, Heartbeat heartbeat, bool isAllowedToCalculate) {
  var taskDao = pm.TaskDao;
  var assignedResourceDao = pm.AssignedResourceDao;
  var messages = new List<MessageContainer>();

  // nothing reported -> nothing to do
  if (heartbeat.JobProgress == null || !heartbeat.JobProgress.Any()) {
    return messages;
  }

  // slave reports progress although it must not calculate
  if (!isAllowedToCalculate && heartbeat.JobProgress.Count != 0) {
    messages.Add(new MessageContainer(MessageContainer.MessageType.PauseAll));
    return messages;
  }

  Action<string> log = text => LogFactory.GetLogger(this.GetType().Namespace).Log(text);

  // select all tasks and statelogs with one query
  var taskIds = heartbeat.JobProgress.Select(x => x.Key).ToList();
  var taskInfos = pm.UseTransaction(() =>
    (from task in taskDao.GetAll()
     where taskIds.Contains(task.TaskId)
     let lastStateLog = task.StateLogs.OrderByDescending(x => x.DateTime).FirstOrDefault()
     select new {
       TaskId = task.TaskId,
       Command = task.Command,
       SlaveId = lastStateLog != null ? lastStateLog.SlaveId : default(Guid)
     }).ToList()
  );

  // process the jobProgresses
  foreach (var jobProgress in heartbeat.JobProgress) {
    var entry = jobProgress; // local copy that is captured by the lambdas below
    var info = taskInfos.SingleOrDefault(x => x.TaskId == entry.Key);

    if (info == null) {
      messages.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, entry.Key));
      log("Task on slave " + heartbeat.SlaveId + " does not exist in DB: " + entry.Key);
      continue;
    }

    var assignedSlaveId = info.SlaveId;
    var assignedToSender = assignedSlaveId != Guid.Empty && assignedSlaveId == heartbeat.SlaveId;
    if (!assignedToSender) {
      // assigned slave does not match heartbeat
      messages.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, info.TaskId));
      log("The slave " + heartbeat.SlaveId + " is not supposed to calculate task: " + info.TaskId);
      continue;
    }

    if (!assignedResourceDao.TaskIsAllowedToBeCalculatedBySlave(info.TaskId, heartbeat.SlaveId)) {
      // assigned resources ids of task do not match with slaveId (and parent resourceGroupIds);
      // this might happen when slave is moved to different group
      messages.Add(new MessageContainer(MessageContainer.MessageType.PauseTask, info.TaskId));
      continue;
    }

    // update task execution time
    pm.UseTransaction(() => {
      taskDao.UpdateExecutionTime(info.TaskId, entry.Value.TotalMilliseconds);
    });

    // forward a pending command of the task to the slave; any other command value produces no message
    if (info.Command == DA.Command.Stop) {
      messages.Add(new MessageContainer(MessageContainer.MessageType.StopTask, info.TaskId));
    } else if (info.Command == DA.Command.Pause) {
      messages.Add(new MessageContainer(MessageContainer.MessageType.PauseTask, info.TaskId));
    } else if (info.Command == DA.Command.Abort) {
      messages.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, info.TaskId));
    }
  }

  return messages;
}