/// <summary>
/// Rebuilds an <see cref="EvaluatorHeartbeatProto"/> from its serialized byte form.
/// </summary>
/// <param name="bytes">Buffer holding the serialized heartbeat.</param>
/// <returns>The heartbeat produced by <c>Serializer.Deserialize</c> for the given bytes.</returns>
public static EvaluatorHeartbeatProto Deserialize(byte[] bytes)
{
    // The stream only wraps the caller's buffer; it is disposed as soon as decoding finishes.
    using (MemoryStream stream = new MemoryStream(bytes))
    {
        return Serializer.Deserialize<EvaluatorHeartbeatProto>(stream);
    }
}
/// <summary>
/// Creates a message that carries the given evaluator heartbeat.
/// </summary>
/// <param name="evaluatorHeartbeatProto">Heartbeat stored in this message.</param>
public REEFMessage(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    this._evaluatorHeartBeat = evaluatorHeartbeatProto;
}
/// <summary>
/// Assembles a heartbeat from the evaluator status, every context status and,
/// when one is present, the task status.
/// </summary>
/// <param name="evaluatorStatusProto">Status stored as the heartbeat's evaluator status.</param>
/// <param name="contextStatusProtos">Context statuses appended to the heartbeat in iteration order.</param>
/// <param name="taskStatusProto">Optional task status; copied into the heartbeat only when present.</param>
/// <returns>The newly created heartbeat, stamped with <c>CurrentTimeMilliSeconds()</c>.</returns>
private EvaluatorHeartbeatProto GetEvaluatorHeartbeatProto(
    EvaluatorStatusProto evaluatorStatusProto,
    ICollection<ContextStatusProto> contextStatusProtos,
    Optional<TaskStatusProto> taskStatusProto)
{
    var heartbeat = new EvaluatorHeartbeatProto();
    heartbeat.timestamp = CurrentTimeMilliSeconds();
    heartbeat.evaluator_status = evaluatorStatusProto;

    foreach (var status in contextStatusProtos)
    {
        heartbeat.context_status.Add(status);
    }

    // task_status is left untouched when no task status was supplied.
    if (taskStatusProto.IsPresent())
    {
        heartbeat.task_status = taskStatusProto.Value;
    }

    return heartbeat;
}
/// <summary>
/// Marks a heartbeat, each of its context statuses and its task status (if any)
/// as recovery messages.
/// </summary>
/// <param name="heartbeat">The heartbeat to flag; it is modified in place.</param>
/// <returns>The same <paramref name="heartbeat"/> instance, for call chaining.</returns>
private EvaluatorHeartbeatProto ConstructRecoveryHeartBeat(EvaluatorHeartbeatProto heartbeat)
{
    heartbeat.recovery = true;
    heartbeat.context_status.ForEach(c => c.recovery = true);

    // A heartbeat does not always carry a task status (it is only populated when one is
    // available), so guard the dereference instead of failing with a NullReferenceException.
    if (heartbeat.task_status != null)
    {
        heartbeat.task_status.recovery = true;
    }

    return heartbeat;
}
/// <summary>
/// Called with a specific EvaluatorStatus that must be delivered to the driver.
/// Wraps the status in a freshly timestamped heartbeat and passes it to <c>Send</c>.
/// </summary>
/// <param name="evaluatorStatusProto">Evaluator status placed in the heartbeat that is sent.</param>
public void OnNext(EvaluatorStatusProto evaluatorStatusProto)
{
    LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext(EvaluatorStatusProto)");
    lock (this)
    {
        LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(EvaluatorStatusProto)");

        // This heartbeat carries only the timestamp and the evaluator status.
        var heartbeat = new EvaluatorHeartbeatProto();
        heartbeat.timestamp = CurrentTimeMilliSeconds();
        heartbeat.evaluator_status = evaluatorStatusProto;

        string announcement = string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}.", heartbeat);
        LOGGER.Log(Level.Info, announcement);

        Send(heartbeat);
    }
}
/// <summary>
/// Sends a heartbeat to the driver, or queues it when it cannot be delivered right now.
/// </summary>
/// <remarks>
/// While the evaluator is in RECOVERY state the heartbeat is only queued. Otherwise it is
/// handed to the observer. If that fails while a task is running, the heartbeat is queued and
/// the failure counter is incremented; once the counter reaches the retry limit the evaluator
/// notifies the context manager of the disconnect, obtains an <c>IDriverConnection</c> from the
/// evaluator injector and switches to RECOVERY state.
/// </remarks>
/// <param name="evaluatorHeartbeatProto">The heartbeat to deliver.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
        {
            LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // NOT during recovery, try to send
        REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(payload);
            _heartbeatFailures = 0; // reset failure counts if we are having intermittent (not continuous) failures
        }
        catch (Exception e)
        {
            // Without a running task the failure is reported through Exceptions.Throw
            // (NOTE(review): expected to throw, so nothing below runs in that case - confirm).
            if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
            {
                Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

            if (_heartbeatFailures >= _maxHeartbeatRetries)
            {
                LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);

                // Kept on a single line: a regular string literal cannot contain a line break.
                LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");
                ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                try
                {
                    _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance<IDriverConnection>();
                }
                catch (Exception ex)
                {
                    Utilities.Diagnostics.Exceptions.CaughtAndThrow(ex, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
                }

                LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                // clean heartbeat failure
                _heartbeatFailures = 0;
            }
        }
    }
}
/// <summary>
/// Sends a heartbeat to the driver, or queues it when it cannot be delivered right now.
/// </summary>
/// <remarks>
/// Heartbeats offered after a completed-state heartbeat has been accepted are dropped with a
/// warning. While the evaluator is in RECOVERY state the heartbeat is only queued. Otherwise it
/// is handed to the observer. If that fails while a task is running, the heartbeat is queued and
/// the failure counter is incremented. What happens when the counter reaches its limit depends
/// on the driver connection: with a <c>MissingDriverConnection</c> the evaluator fails with a
/// <c>ReefRuntimeException</c>; otherwise it notifies the context manager of the disconnect and
/// switches to RECOVERY state.
/// </remarks>
/// <param name="evaluatorHeartbeatProto">The heartbeat to deliver.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        // Do not send a heartbeat if Evaluator has already signaled that it was done.
        if (_isCompletedHeartbeatQueued)
        {
            LOGGER.Log(Level.Warning, "Evaluator trying to schedule a heartbeat after a completed heartbeat has already been scheduled or sent.");
            return;
        }

        // Remember that a completed-state heartbeat has been accepted, before attempting delivery.
        if (IsEvaluatorStateCompleted(evaluatorHeartbeatProto.evaluator_status.state))
        {
            _isCompletedHeartbeatQueued = true;
        }

        if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
        {
            LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // NOT during recovery, try to send
        REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(payload);
            _heartbeatFailures = 0; // reset failure counts if we are having intermittent (not continuous) failures
        }
        catch (Exception e)
        {
            // Without a running task the failure is reported through Exceptions.Throw
            // (NOTE(review): expected to throw, so nothing below runs in that case - confirm).
            if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
            {
                Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

            if (_driverConnection.Get() is MissingDriverConnection)
            {
                if (_heartbeatFailures >= _maxHeartbeatRetriesForNonRecoveryMode)
                {
                    // Kept on a single line: a regular string literal cannot contain a line break.
                    var msg = string.Format(CultureInfo.InvariantCulture, "Have encountered {0} heartbeat failures. Limit of heartbeat sending failures exceeded. Driver reconnect logic is not implemented, failing evaluator.", _heartbeatFailures);
                    LOGGER.Log(Level.Error, msg);
                    throw new ReefRuntimeException(msg, e);
                }
            }
            else
            {
                if (_heartbeatFailures >= _maxHeartbeatRetries)
                {
                    LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);
                    LOGGER.Log(Level.Info, "Entering RECOVERY mode!!!");
                    ContextManager.HandleDriverConnectionMessage(
                        new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                    LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                    _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                    // clean heartbeat failure
                    _heartbeatFailures = 0;
                }
            }
        }
    }
}