Esempio n. 1
0
        /// <summary>
        /// Service entry point for a slave heartbeat: authenticates the caller for the
        /// Slave role, forwards the heartbeat to the <c>HeartbeatManager</c> and returns
        /// the messages it produced.
        /// </summary>
        /// <param name="heartbeat">the heartbeat received from the slave</param>
        /// <returns>
        /// the messages produced by <c>HeartbeatManager.ProcessHeartbeat</c>, or an empty
        /// list if processing threw an exception (the exception is logged, not rethrown)
        /// </returns>
        public List <MessageContainer> Heartbeat(DT.Heartbeat heartbeat)
        {
            RoleVerifier.AuthenticateForAnyRole(HiveRoles.Slave);

            // Starts out empty so that a failure during processing still yields a valid reply.
            var messages = new List <MessageContainer>();
            try
            {
                using (new PerformanceLogger("ProcessHeartbeat"))
                {
                    messages = HeartbeatManager.ProcessHeartbeat(heartbeat);
                }
            }
            catch (Exception ex)
            {
                // Best effort: log the failure and fall through to the return below.
                var logger = DA.LogFactory.GetLogger(this.GetType().Namespace);
                logger.Log(string.Format("Exception processing Heartbeat: {0}", ex));
            }

            // Optionally run the event manager as part of the heartbeat (controlled by a setting).
            bool triggerEventManager = HeuristicLab.Services.Hive.Properties.Settings.Default.TriggerEventManagerInHeartbeat;
            if (triggerEventManager)
            {
                TriggerEventManager(false);
            }

            return messages;
        }
    /// <summary>
    /// This method will be called every time a slave sends a heartbeat (-> very often; concurrency is important!)
    /// It updates the stored slave data from the heartbeat, collects the task related messages
    /// from <c>UpdateTasks</c> and, if the slave requests work, tries to assign one waiting task
    /// while holding the named scheduling mutex.
    /// </summary>
    /// <param name="heartbeat">the heartbeat sent by the slave</param>
    /// <returns>a list of actions the slave should do</returns>
    public List<MessageContainer> ProcessHeartbeat(Heartbeat heartbeat) {
      List<MessageContainer> actions = new List<MessageContainer>();
      var pm = PersistenceManager;
      var slaveDao = pm.SlaveDao;
      var taskDao = pm.TaskDao;
      var slave = pm.UseTransaction(() => slaveDao.GetById(heartbeat.SlaveId));
      if (slave == null) {
        // unknown slave: ask it to say hello first
        actions.Add(new MessageContainer(MessageContainer.MessageType.SayHello));
      } else {
        if (heartbeat.HbInterval != slave.HbInterval) {
          actions.Add(new MessageContainer(MessageContainer.MessageType.NewHBInterval));
        }
        if (slaveDao.SlaveHasToShutdownComputer(slave.ResourceId)) {
          actions.Add(new MessageContainer(MessageContainer.MessageType.ShutdownComputer));
        }
        // update slave data  
        slave.FreeCores = heartbeat.FreeCores;
        slave.FreeMemory = heartbeat.FreeMemory;
        slave.CpuUtilization = heartbeat.CpuUtilization;
        slave.SlaveState = (heartbeat.JobProgress != null && heartbeat.JobProgress.Count > 0)
          ? DA.SlaveState.Calculating
          : DA.SlaveState.Idle;
        slave.LastHeartbeat = DateTime.Now;
        pm.UseTransaction(() => {
          slave.IsAllowedToCalculate = slaveDao.SlaveIsAllowedToCalculate(slave.ResourceId);
          pm.SubmitChanges();
        });

        // update task data
        actions.AddRange(UpdateTasks(pm, heartbeat, slave.IsAllowedToCalculate));

        // assign new task
        if (heartbeat.AssignJob && slave.IsAllowedToCalculate && heartbeat.FreeCores > 0) {
          bool mutexAquired = false;
          // The mutex wraps an OS handle; dispose it so that a handle is not leaked on every heartbeat.
          using (var mutex = new Mutex(false, MutexName)) {
            try {
              // wait only for a limited time so that heartbeats do not pile up behind the scheduler
              mutexAquired = mutex.WaitOne(Properties.Settings.Default.SchedulingPatience);
              if (mutexAquired) {
                var waitingTasks = pm.UseTransaction(() => taskDao.GetWaitingTasks(slave)
                    .Select(x => new TaskInfoForScheduler {
                      TaskId = x.TaskId,
                      JobId = x.JobId,
                      Priority = x.Priority
                    })
                    .ToList()
                );
                var availableTasks = TaskScheduler.Schedule(waitingTasks).ToArray();
                if (availableTasks.Any()) {
                  var task = availableTasks.First();
                  AssignTask(pm, slave, task.TaskId);
                  actions.Add(new MessageContainer(MessageContainer.MessageType.CalculateTask, task.TaskId));
                }
              } else {
                LogFactory.GetLogger(this.GetType().Namespace).Log("HeartbeatManager: The mutex used for scheduling could not be aquired.");
              }
            }
            catch (AbandonedMutexException) {
              // When WaitOne reports an abandoned mutex, the calling thread has nevertheless been
              // given ownership of it. Remember that, so the finally block releases the mutex
              // instead of leaving it locked by this thread. No task is scheduled for this heartbeat.
              mutexAquired = true;
              LogFactory.GetLogger(this.GetType().Namespace).Log("HeartbeatManager: The mutex used for scheduling has been abandoned.");
            }
            catch (Exception ex) {
              LogFactory.GetLogger(this.GetType().Namespace).Log(string.Format("HeartbeatManager threw an exception in ProcessHeartbeat: {0}", ex));
            }
            finally {
              if (mutexAquired) mutex.ReleaseMutex();
            }
          }
        }
      }
      return actions;
    }
    /// <summary>
    /// Updates the execution time of every task reported in the heartbeat and builds the
    /// messages that should be sent back to the slave.
    /// Also checks that each reported task is supposed to be calculated by this slave.
    /// </summary>
    /// <param name="pm">persistence manager providing the DAOs and the transaction scope</param>
    /// <param name="heartbeat">heartbeat carrying the reported task progress</param>
    /// <param name="isAllowedToCalculate">whether the slave is currently allowed to calculate</param>
    /// <returns>the messages for the slave; empty if the heartbeat reports no task progress</returns>
    private IEnumerable<MessageContainer> UpdateTasks(IPersistenceManager pm, Heartbeat heartbeat, bool isAllowedToCalculate) {
      var taskDao = pm.TaskDao;
      var assignedResourceDao = pm.AssignedResourceDao;
      var messages = new List<MessageContainer>();

      // nothing reported -> nothing to update
      if (heartbeat.JobProgress == null || !heartbeat.JobProgress.Any()) {
        return messages;
      }

      // the slave reports running tasks although it must not calculate: pause everything
      if (!isAllowedToCalculate && heartbeat.JobProgress.Count != 0) {
        messages.Add(new MessageContainer(MessageContainer.MessageType.PauseAll));
        return messages;
      }

      // load all reported tasks together with the slave id of their latest state log in one query
      var reportedTaskIds = heartbeat.JobProgress.Select(x => x.Key).ToList();
      var knownTasks = pm.UseTransaction(() =>
        (from task in taskDao.GetAll()
         where reportedTaskIds.Contains(task.TaskId)
         let lastStateLog = task.StateLogs.OrderByDescending(x => x.DateTime).FirstOrDefault()
         select new {
           TaskId = task.TaskId,
           Command = task.Command,
           SlaveId = lastStateLog != null ? lastStateLog.SlaveId : default(Guid)
         }).ToList()
      );

      // handle each reported task
      foreach (var jobProgress in heartbeat.JobProgress) {
        var entry = jobProgress;
        var taskInfo = knownTasks.SingleOrDefault(x => x.TaskId == entry.Key);

        if (taskInfo == null) {
          // the slave works on a task the database does not know
          messages.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, entry.Key));
          LogFactory.GetLogger(this.GetType().Namespace).Log("Task on slave " + heartbeat.SlaveId + " does not exist in DB: " + jobProgress.Key);
          continue;
        }

        var assignedSlaveId = taskInfo.SlaveId;
        if (assignedSlaveId == Guid.Empty || assignedSlaveId != heartbeat.SlaveId) {
          // the latest state log names no slave or a different slave than the one sending the heartbeat
          messages.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, taskInfo.TaskId));
          LogFactory.GetLogger(this.GetType().Namespace).Log("The slave " + heartbeat.SlaveId + " is not supposed to calculate task: " + taskInfo.TaskId);
          continue;
        }

        if (!assignedResourceDao.TaskIsAllowedToBeCalculatedBySlave(taskInfo.TaskId, heartbeat.SlaveId)) {
          // assigned resource ids of the task do not match the slave id (and parent resource group ids);
          // this might happen when the slave is moved to a different group
          messages.Add(new MessageContainer(MessageContainer.MessageType.PauseTask, taskInfo.TaskId));
          continue;
        }

        // the task legitimately runs on this slave: store its execution time
        pm.UseTransaction(() => {
          taskDao.UpdateExecutionTime(taskInfo.TaskId, entry.Value.TotalMilliseconds);
        });

        // forward a pending command of the task to the slave, if there is one
        var command = taskInfo.Command;
        if (command == DA.Command.Stop) {
          messages.Add(new MessageContainer(MessageContainer.MessageType.StopTask, taskInfo.TaskId));
        } else if (command == DA.Command.Pause) {
          messages.Add(new MessageContainer(MessageContainer.MessageType.PauseTask, taskInfo.TaskId));
        } else if (command == DA.Command.Abort) {
          messages.Add(new MessageContainer(MessageContainer.MessageType.AbortTask, taskInfo.TaskId));
        }
      }

      return messages;
    }