/// <summary>
/// Receives a <see cref="REEFMessage"/> and, when it carries an evaluator control
/// payload, logs the receipt and passes that payload to <c>Handle</c>.
/// A null message, or a message whose <c>evaluatorControl</c> is null, is ignored.
/// </summary>
/// <param name="value">The incoming message; may be null.</param>
public void OnNext(REEFMessage value)
{
    // Nothing to do without a message or without an evaluator control payload.
    if (value == null || value.evaluatorControl == null)
    {
        return;
    }

    Logger.Log(Level.Verbose, "Received a REEFMessage with EvaluatorControl");
    Handle(value.evaluatorControl);
}
/// <summary>
/// Receives a <see cref="REEFMessage"/> and, when it carries an evaluator control
/// payload, logs the receipt at Info level and passes that payload to <c>Handle</c>.
/// A null message, or a message whose <c>evaluatorControl</c> is null, is ignored.
/// </summary>
/// <param name="value">The incoming message; may be null.</param>
public void OnNext(REEFMessage value)
{
    // Bail out early when there is no message at all.
    if (value == null)
    {
        return;
    }

    // Only messages that carry an evaluator control payload are handled here.
    if (value.evaluatorControl == null)
    {
        return;
    }

    LOGGER.Log(Level.Info, "Received a REEFMessage with EvaluatorControl");
    Handle(value.evaluatorControl);
}
/// <summary>
/// Sends an evaluator heartbeat to the driver through <c>_observer</c>, or queues it
/// in <c>_queuedHeartbeats</c> when it cannot be sent.
/// </summary>
/// <remarks>
/// The whole method runs while holding the lock on <c>_queuedHeartbeats</c>.
/// <list type="bullet">
/// <item>If a completed heartbeat was already queued or sent, a warning is logged and the call returns.</item>
/// <item>If <c>IsEvaluatorStateCompleted</c> reports true for this heartbeat's evaluator state,
/// <c>_isCompletedHeartbeatQueued</c> is set before anything else is attempted.</item>
/// <item>While the operation state is RECOVERY, the heartbeat is only queued.</item>
/// <item>Otherwise the heartbeat is wrapped in a <see cref="REEFMessage"/> and passed to <c>_observer.OnNext</c>.
/// On failure it is queued and the failure counter is incremented; once the counter reaches
/// <c>_maxHeartbeatRetries</c> the operation state is switched to RECOVERY.</item>
/// </list>
/// </remarks>
/// <param name="evaluatorHeartbeatProto">The heartbeat to send; its <c>evaluator_status</c> is dereferenced unconditionally.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        // Do not send a heartbeat if Evaluator has already signaled that it was done.
        if (_isCompletedHeartbeatQueued)
        {
            LOGGER.Log(Level.Warning, "Evaluator trying to schedule a heartbeat after a completed heartbeat has already been scheduled or sent.");
            return;
        }

        // Remember that a "completed" heartbeat has been seen so later calls are rejected above.
        if (IsEvaluatorStateCompleted(evaluatorHeartbeatProto.evaluator_status.state))
        {
            _isCompletedHeartbeatQueued = true;
        }

        if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
        {
            LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // NOT during recovery, try to send
        REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(payload);
            _heartbeatFailures = 0; // reset failure counts if we are having intermittent (not continuous) failures
        }
        catch (Exception e)
        {
            if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
            {
                // NOTE(review): Exceptions.Throw presumably rethrows, so the code below would only
                // run when a task is RUNNING - confirm against the helper's implementation.
                Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

            if (_heartbeatFailures >= _maxHeartbeatRetries)
            {
                LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);

                // The banner text contains an embedded line break; it is written as an escape
                // because a regular string literal cannot span physical lines.
                LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. \n===========");
                ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                // clean heartbeat failure
                _heartbeatFailures = 0;
            }
        }
    }
}
/// <summary>
/// Sends an evaluator heartbeat to the driver through <c>_observer</c>, or queues it
/// in <c>_queuedHeartbeats</c> when it cannot be sent.
/// </summary>
/// <remarks>
/// The whole method runs while holding the lock on <c>_queuedHeartbeats</c>.
/// <list type="bullet">
/// <item>While the operation state is RECOVERY, the heartbeat is only queued.</item>
/// <item>Otherwise the heartbeat is wrapped in a <see cref="REEFMessage"/> and passed to <c>_observer.OnNext</c>;
/// a successful send resets the failure counter.</item>
/// <item>On failure the heartbeat is queued and the failure counter is incremented. Once the counter
/// reaches <c>_maxHeartbeatRetries</c>, a Disconnected message is passed to <c>ContextManager</c>,
/// an <c>IDriverConnection</c> is obtained from the evaluator injector, and the operation state
/// is switched to RECOVERY.</item>
/// </list>
/// </remarks>
/// <param name="evaluatorHeartbeatProto">The heartbeat to send or queue.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
        {
            LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // NOT during recovery, try to send
        REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(payload);
            _heartbeatFailures = 0; // reset failure counts if we are having intermittent (not continuous) failures
        }
        catch (Exception e)
        {
            if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
            {
                // NOTE(review): Exceptions.Throw presumably rethrows, so the code below would only
                // run when a task is RUNNING - confirm against the helper's implementation.
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

            if (_heartbeatFailures >= _maxHeartbeatRetries)
            {
                LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);

                // The banner text contains an embedded line break; it is written as an escape
                // because a regular string literal cannot span physical lines.
                LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. \n===========");
                ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                // Resolve the reconnect implementation only now that the driver is considered unreachable.
                try
                {
                    _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance<IDriverConnection>();
                }
                catch (Exception ex)
                {
                    Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(ex, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
                }

                LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                // clean heartbeat failure
                _heartbeatFailures = 0;
            }
        }
    }
}