/// <summary>
/// Flags a heartbeat, and the context/task statuses it carries, as a recovery heartbeat.
/// </summary>
/// <param name="heartbeat">The heartbeat to flag; it is modified in place.</param>
/// <returns>The same <paramref name="heartbeat"/> instance, with the recovery flags set.</returns>
private EvaluatorHeartbeatProto ConstructRecoveryHeartBeat(EvaluatorHeartbeatProto heartbeat)
{
    heartbeat.recovery = true;
    heartbeat.context_status.ForEach(c => c.recovery = true);

    // A heartbeat does not necessarily carry a task status (other code handling
    // heartbeats checks task_status for null), so only flag it when present.
    if (heartbeat.task_status != null)
    {
        heartbeat.task_status.recovery = true;
    }

    return heartbeat;
}
/// <summary>
/// Receives and routes heartbeats from Evaluators.
/// </summary>
/// <remarks>
/// The heartbeat is forwarded to the <c>EvaluatorManager</c> registered under the
/// evaluator id found in the heartbeat's evaluator status. When no manager is
/// registered for that id, the problem is logged at Error level and an
/// <see cref="InvalidOperationException"/> is handed to <c>Exceptions.Throw</c>.
/// </remarks>
/// <param name="evaluatorHearBeatProto">The remote message wrapping the evaluator heartbeat.</param>
private void Handle(IRemoteMessage<EvaluatorHeartbeatProto> evaluatorHearBeatProto)
{
    EvaluatorHeartbeatProto heartbeat = evaluatorHearBeatProto.Message;
    EvaluatorStatusProto status = heartbeat.evaluator_status;
    string evaluatorId = status.evaluator_id;

    LOGGER.Log(
        Level.Info,
        string.Format(
            CultureInfo.InvariantCulture,
            "Heartbeat from Evaluator {0} with state {1} timestamp {2}",
            evaluatorId,
            status.state,
            heartbeat.timestamp));
    _sanityChecker.check(evaluatorId, heartbeat.timestamp);

    lock (_evaluators)
    {
        // Single lookup instead of ContainsKey followed by the indexer.
        EvaluatorManager evaluatorManager;
        if (_evaluators.TryGetValue(evaluatorId, out evaluatorManager))
        {
            evaluatorManager.Handle(evaluatorHearBeatProto);
            return;
        }

        // status has already been dereferenced above, so it is known to be non-null here
        // and its state can always be reported.
        string msg = "Contact from unknown evaluator with id: " + evaluatorId + " with state " + status.state;
        LOGGER.Log(Level.Error, msg);
        Exceptions.Throw(new InvalidOperationException(msg), LOGGER);
    }
}
/// <summary>
/// Reconnects to a driver described by <paramref name="driverInformation"/>, flushes the
/// heartbeats queued while in RECOVERY mode, and switches back to OPERATIONAL.
/// </summary>
/// <param name="driverInformation">
/// Supplies the driver remote identifier and the name server id used to reconnect.
/// </param>
private void Recover(DriverInformation driverInformation)
{
    // Point the remote observer at the driver endpoint given in the driver information.
    _remoteId = new SocketRemoteIdentifier(NetUtilities.ParseIpEndpoint(driverInformation.DriverRemoteIdentifier));
    var driverEventEndPoint = new RemoteEventEndPoint<REEFMessage>(_remoteId);
    _observer = _remoteManager.GetRemoteObserver(driverEventEndPoint);

    lock (_evaluatorSettings)
    {
        if (_evaluatorSettings.NameClient != null)
        {
            try
            {
                LOGGER.Log(Level.Verbose, "Trying to reset and reconnect to name server" + driverInformation.NameServerId);
                IPEndPoint nameServerEndpoint = NetUtilities.ParseIpEndpoint(driverInformation.NameServerId);
                _evaluatorSettings.NameClient.Restart(nameServerEndpoint);
                LOGGER.Log(Level.Info, "Reconnected to name server: " + driverInformation.NameServerId);
            }
            catch (Exception e)
            {
                // A failed name server reconnect is reported but does not abort the recovery.
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(e, Level.Error, LOGGER);
            }
        }
    }

    lock (_queuedHeartbeats)
    {
        bool recoveryFlagAlreadySent = false;
        while (_queuedHeartbeats.Any())
        {
            LOGGER.Log(Level.Info, "Sending cached recovery heartbeats to " + _remoteId);
            try
            {
                EvaluatorHeartbeatProto pending = _queuedHeartbeats.Dequeue();
                if (recoveryFlagAlreadySent)
                {
                    _observer.OnNext(new REEFMessage(pending));
                }
                else
                {
                    // first heartbeat is specially constructed to include the recovery flag
                    EvaluatorHeartbeatProto recoveryHeartbeat = ConstructRecoveryHeartBeat(pending);
                    LOGGER.Log(Level.Info, "Recovery heartbeat to be sent:" + recoveryHeartbeat);
                    _observer.OnNext(new REEFMessage(recoveryHeartbeat));
                    recoveryFlagAlreadySent = true;
                }
            }
            catch (Exception e)
            {
                // we do not handle failures during RECOVERY
                string failureMessage = string.Format(
                    CultureInfo.InvariantCulture,
                    "Hearbeat attempt failed in RECOVERY mode to Driver {0} , giving up...",
                    _remoteId);
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(e, Level.Error, failureMessage, LOGGER);
            }

            // Pause between queued heartbeats.
            Thread.Sleep(500);
        }
    }

    _evaluatorSettings.OperationState = EvaluatorOperationState.OPERATIONAL;
    LOGGER.Log(Level.Info, "=========== Exiting RECOVERY mode. ===========");
}
/// <summary>
/// Sends a heartbeat to the driver, or queues it when the driver cannot currently be reached.
/// </summary>
/// <remarks>
/// Once a heartbeat carrying a completed evaluator state has been accepted, later heartbeats
/// are dropped with a warning. While the operation state is RECOVERY, heartbeats are only
/// queued. Otherwise the heartbeat is pushed to the driver observer; on failure it is queued
/// and the failure counter is incremented, and when the counter reaches the configured
/// maximum the operation state is switched to RECOVERY.
/// </remarks>
/// <param name="evaluatorHeartbeatProto">The heartbeat to deliver.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        // Do not send a heartbeat if Evaluator has already signaled that it was done.
        if (_isCompletedHeartbeatQueued)
        {
            LOGGER.Log(Level.Warning, "Evaluator trying to schedule a heartbeat after a completed heartbeat has already been scheduled or sent.");
            return;
        }

        if (IsEvaluatorStateCompleted(evaluatorHeartbeatProto.evaluator_status.state))
        {
            _isCompletedHeartbeatQueued = true;
        }

        if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
        {
            LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // NOT during recovery, try to send
        REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(payload);
            _heartbeatFailures = 0; // reset failure counts if we are having intermittent (not continuous) failures
        }
        catch (Exception e)
        {
            // Queuing for recovery is only attempted while a task is running.
            if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
            {
                Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

            if (_heartbeatFailures >= _maxHeartbeatRetries)
            {
                LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);

                // The banner is a single-line literal; a regular string literal cannot contain a raw line break.
                LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");
                ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                // clean heartbeat failure
                _heartbeatFailures = 0;
            }
        }
    }
}
/// <summary>
/// Builds a new heartbeat via <c>GetEvaluatorHeartbeatProto()</c> and passes it to <c>Send</c>.
/// </summary>
public void OnNext()
{
    LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext()");
    lock (this)
    {
        LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext()");

        var currentHeartbeat = GetEvaluatorHeartbeatProto();
        string triggerMessage = string.Format(
            CultureInfo.InvariantCulture,
            "Triggered a heartbeat: {0}.",
            currentHeartbeat);
        LOGGER.Log(Level.Info, triggerMessage);

        Send(currentHeartbeat);
    }
}
/// <summary>
/// Sends a heartbeat to the driver, or queues it when the driver cannot currently be reached.
/// </summary>
/// <remarks>
/// While the operation state is RECOVERY, heartbeats are only queued. Otherwise the heartbeat
/// is pushed to the driver observer; on failure it is queued and the failure counter is
/// incremented. When the counter reaches the configured maximum, the driver reconnect
/// implementation is obtained from the evaluator injector and the operation state is
/// switched to RECOVERY.
/// </remarks>
/// <param name="evaluatorHeartbeatProto">The heartbeat to deliver.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
        {
            LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // NOT during recovery, try to send
        REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(payload);
            _heartbeatFailures = 0; // reset failure counts if we are having intermittent (not continuous) failures
        }
        catch (Exception e)
        {
            // Queuing for recovery is only attempted while a task is running.
            if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
            {
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

            if (_heartbeatFailures >= _maxHeartbeatRetries)
            {
                LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);

                // The banner is a single-line literal; a regular string literal cannot contain a raw line break.
                LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");
                ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                try
                {
                    _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance<IDriverConnection>();
                }
                catch (Exception ex)
                {
                    Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(ex, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
                }

                LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                // clean heartbeat failure
                _heartbeatFailures = 0;
            }
        }
    }
}
/// <summary>
/// Alarm callback: sends a regular heartbeat while operational and running, or polls for a
/// restarted driver while in RECOVERY mode, then schedules the next alarm.
/// </summary>
/// <remarks>
/// When the evaluator runtime state is a completed state, the method returns without
/// scheduling another alarm.
/// </remarks>
/// <param name="value">The alarm that fired; it is not inspected.</param>
public void OnNext(Alarm value)
{
    LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext(Alarm)");
    lock (this)
    {
        LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(Alarm)");
        if (_evaluatorSettings.OperationState == EvaluatorOperationState.OPERATIONAL && EvaluatorRuntime.State == State.RUNNING)
        {
            EvaluatorHeartbeatProto evaluatorHeartbeatProto = GetEvaluatorHeartbeatProto();
            LOGGER.Log(Level.Verbose, string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}. {1}Node Health: {2}", evaluatorHeartbeatProto, Environment.NewLine, MachineStatus.ToString()));
            Send(evaluatorHeartbeatProto);
        }
        else
        {
            LOGGER.Log(Level.Verbose, "Ignoring regular heartbeat since Evaluator operation state is [{0}] and runtime state is [{1}]. ", EvaluatorSettings.OperationState, EvaluatorRuntime.State);

            // Do not try to recover if Evaluator is done.
            if (IsEvaluatorStateCompleted(EvaluatorRuntime.State))
            {
                return;
            }

            if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
            {
                try
                {
                    // Resolving the driver connection happens inside the try block as well, so a
                    // failure here is logged and retried on the next alarm instead of escaping
                    // before the next alarm is scheduled.
                    var driverConnection = _driverConnection.Get();
                    var driverInformation = driverConnection.GetDriverInformation();
                    if (driverInformation == null)
                    {
                        LOGGER.Log(Level.Verbose, "In RECOVERY mode, cannot retrieve driver information, will try again later.");
                    }
                    else
                    {
                        LOGGER.Log(
                            Level.Info,
                            string.Format(
                                CultureInfo.InvariantCulture,
                                "Detect driver restarted at {0} and is running on endpoint {1} with services {2}. Now trying to re-establish connection",
                                driverInformation.DriverStartTime,
                                driverInformation.DriverRemoteIdentifier,
                                driverInformation.NameServerId));
                        Recover(driverInformation);
                    }
                }
                catch (Exception e)
                {
                    // we do not want any exception to stop the query for driver status
                    Utilities.Diagnostics.Exceptions.Caught(e, Level.Warning, LOGGER);
                }
            }
        }

        _clock.ScheduleAlarm(_heartBeatPeriodInMillSeconds, this);
    }
}
/// <summary>
/// Wraps the given task status in a heartbeat, together with the current evaluator status
/// and context statuses, and passes it to <c>Send</c>.
/// </summary>
/// <param name="taskStatusProto">The task status to include in the heartbeat.</param>
public void OnNext(TaskStatusProto taskStatusProto)
{
    LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext(TaskStatusProto)");
    lock (this)
    {
        LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(TaskStatusProto)");

        // Gather the pieces in the same order the heartbeat builder receives them.
        var evaluatorStatus = EvaluatorRuntime.GetEvaluatorStatus();
        var contextStatuses = ContextManager.GetContextStatusCollection();
        var taskStatus = Optional<TaskStatusProto>.Of(taskStatusProto);

        EvaluatorHeartbeatProto taskHeartbeat = GetEvaluatorHeartbeatProto(evaluatorStatus, contextStatuses, taskStatus);

        string triggerMessage = string.Format(
            CultureInfo.InvariantCulture,
            "Triggered a heartbeat: {0}.",
            taskHeartbeat);
        LOGGER.Log(Level.Info, triggerMessage);

        Send(taskHeartbeat);
    }
}
/// <summary>
/// Wraps the given evaluator status in a freshly timestamped heartbeat and passes it to
/// <c>Send</c>. No context or task status is attached.
/// </summary>
/// <param name="evaluatorStatusProto">The evaluator status to include in the heartbeat.</param>
public void OnNext(EvaluatorStatusProto evaluatorStatusProto)
{
    LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext(EvaluatorStatusProto)");
    lock (this)
    {
        LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(EvaluatorStatusProto)");

        var statusHeartbeat = new EvaluatorHeartbeatProto();
        statusHeartbeat.timestamp = CurrentTimeMilliSeconds();
        statusHeartbeat.evaluator_status = evaluatorStatusProto;

        string triggerMessage = string.Format(
            CultureInfo.InvariantCulture,
            "Triggered a heartbeat: {0}.",
            statusHeartbeat);
        LOGGER.Log(Level.Info, triggerMessage);

        Send(statusHeartbeat);
    }
}
/// <summary>
/// Builds a heartbeat whose context statuses start with the given one, followed by those
/// reported by the context manager, and passes it to <c>Send</c>. No task status is attached.
/// </summary>
/// <param name="contextStatusProto">The context status to place first in the heartbeat.</param>
public void OnNext(ContextStatusProto contextStatusProto)
{
    LOGGER.Log(Level.Verbose, "Before aqcuiring lock: HeartbeatManager::OnNext(ContextStatusProto)");
    lock (this)
    {
        LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(ContextStatusProto)");

        // The explicitly supplied status goes first, then whatever the context manager reports.
        var allContextStatuses = new List<ContextStatusProto> { contextStatusProto };
        allContextStatuses.AddRange(_contextManager.GetContextStatusCollection());

        var evaluatorStatus = _evaluatorRuntime.GetEvaluatorStatus();
        var noTaskStatus = Optional<TaskStatusProto>.Empty();
        EvaluatorHeartbeatProto contextHeartbeat = GetEvaluatorHeartbeatProto(evaluatorStatus, allContextStatuses, noTaskStatus);

        string triggerMessage = string.Format(
            CultureInfo.InvariantCulture,
            "Triggered a heartbeat: {0}.",
            contextHeartbeat);
        LOGGER.Log(Level.Info, triggerMessage);

        Send(contextHeartbeat);
    }
}
/// <summary>
/// Assembles a heartbeat stamped with the current time from the supplied parts.
/// </summary>
/// <param name="evaluatorStatusProto">Stored as the heartbeat's evaluator status.</param>
/// <param name="contextStatusProtos">Each element is appended to the heartbeat's context status list, in order.</param>
/// <param name="taskStatusProto">When present, its value is stored as the heartbeat's task status.</param>
/// <returns>The newly created heartbeat.</returns>
private EvaluatorHeartbeatProto GetEvaluatorHeartbeatProto(
    EvaluatorStatusProto evaluatorStatusProto,
    ICollection<ContextStatusProto> contextStatusProtos,
    Optional<TaskStatusProto> taskStatusProto)
{
    var heartbeat = new EvaluatorHeartbeatProto();
    heartbeat.timestamp = CurrentTimeMilliSeconds();
    heartbeat.evaluator_status = evaluatorStatusProto;

    foreach (var contextStatus in contextStatusProtos)
    {
        heartbeat.context_status.Add(contextStatus);
    }

    // The task status is optional; leave it unset when none was supplied.
    if (taskStatusProto.IsPresent())
    {
        heartbeat.task_status = taskStatusProto.Value;
    }

    return heartbeat;
}
/// <summary>
/// Processes a heartbeat received from this evaluator: updates the evaluator state and
/// dispatches the carried context and task statuses to their handlers.
/// </summary>
/// <remarks>
/// When the evaluator status carries an error, the error is handled as an
/// <c>EvaluatorException</c> and processing stops immediately. A heartbeat without an
/// evaluator status still has its context and task statuses dispatched; only the
/// evaluator state handling is skipped.
/// </remarks>
/// <param name="evaluatorHearBeatProtoMessage">The remote message wrapping the heartbeat.</param>
public void Handle(IRemoteMessage<EvaluatorHeartbeatProto> evaluatorHearBeatProtoMessage)
{
    lock (_evaluatorDescriptor)
    {
        EvaluatorHeartbeatProto heartbeatProto = evaluatorHearBeatProtoMessage.Message;
        EvaluatorStatusProto evaluatorStatusProto = heartbeatProto.evaluator_status;

        if (evaluatorStatusProto != null)
        {
            if (evaluatorStatusProto.error != null)
            {
                Handle(new EvaluatorException(Id, ByteUtilities.ByteArrarysToString(evaluatorStatusProto.error)));
                return;
            }
            else if (_state == STATE.SUBMITTED)
            {
                string evaluatorRId = evaluatorHearBeatProtoMessage.Identifier.ToString();
                LOGGER.Log(Level.Info, "TODO: REPLACE THIS " + evaluatorRId);

                // TODO
                // _evaluatorControlHandler = _remoteManager.getHandler(evaluatorRID, EvaluatorRuntimeProtocol.EvaluatorControlProto.class);
                _state = STATE.RUNNING;
                LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Evaluator {0} is running", _evaluatorId));
            }
        }

        LOGGER.Log(Level.Info, "Evaluator heartbeat: " + heartbeatProto);

        foreach (ContextStatusProto contextStatusProto in heartbeatProto.context_status)
        {
            Handle(contextStatusProto, heartbeatProto.task_status != null);
        }

        if (heartbeatProto.task_status != null)
        {
            Handle(heartbeatProto.task_status);
        }

        // Guard the state checks the same way as the block above: without an evaluator
        // status there is no state to inspect.
        if (evaluatorStatusProto != null)
        {
            if (evaluatorStatusProto.state == State.FAILED)
            {
                _state = STATE.FAILED;
                EvaluatorException e = evaluatorStatusProto.error != null
                    ? new EvaluatorException(_evaluatorId, ByteUtilities.ByteArrarysToString(evaluatorStatusProto.error))
                    : new EvaluatorException(_evaluatorId, "unknown cause");
                LOGGER.Log(Level.Warning, "Failed evaluator: " + Id + e.Message);
                Handle(e);
            }
            else if (evaluatorStatusProto.state == State.DONE)
            {
                LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Evaluator {0} done", Id));
                _state = STATE.DONE;

                // TODO
                // dispatcher.onNext(CompletedEvaluator.class, new CompletedEvaluator() {
                // @Override
                // public String getId() {
                //   return EvaluatorManager.this.evaluatorId;
                Dispose();
            }
        }
    }

    LOGGER.Log(Level.Info, "DONE with evaluator heartbeat");
}
/// <summary>
/// Creates a message that carries the given evaluator heartbeat.
/// </summary>
/// <param name="evaluatorHeartbeatProto">The heartbeat stored in this message.</param>
public REEFMessage(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    this._evaluatorHeartBeat = evaluatorHeartbeatProto;
}