Example #1
0
 /// <summary>
 /// Passes the <c>evaluatorControl</c> part of an incoming message on to <c>Handle</c>.
 /// </summary>
 /// <param name="value">
 /// The received message. Nothing happens when it is null or when its
 /// <c>evaluatorControl</c> is null.
 /// </param>
 public void OnNext(REEFMessage value)
 {
     // Nothing to dispatch without a message or without a control payload.
     if (value == null || value.evaluatorControl == null)
     {
         return;
     }

     Logger.Log(Level.Verbose, "Received a REEFMessage with EvaluatorControl");
     Handle(value.evaluatorControl);
 }
Example #2
0
 /// <summary>
 /// Forwards the <c>evaluatorControl</c> carried by a received message to <c>Handle</c>,
 /// logging the receipt at Info level first.
 /// </summary>
 /// <param name="value">
 /// The received message. It is ignored when it is null or has no <c>evaluatorControl</c>.
 /// </param>
 public void OnNext(REEFMessage value)
 {
     // Bail out early: there is no control message to handle.
     if (value == null || value.evaluatorControl == null)
     {
         return;
     }

     LOGGER.Log(Level.Info, "Received a REEFMessage with EvaluatorControl");
     Handle(value.evaluatorControl);
 }
Example #3
0
        /// <summary>
        /// Sends an evaluator heartbeat to the driver, or queues it when it cannot be delivered now.
        /// </summary>
        /// <remarks>
        /// The whole method runs while holding the lock on <c>_queuedHeartbeats</c>.
        /// <para>
        /// A heartbeat is dropped (with a warning) once a completed-state heartbeat has already been
        /// accepted. While the evaluator is in RECOVERY mode the heartbeat is only queued. Otherwise it
        /// is wrapped in a <c>REEFMessage</c> and passed to <c>_observer.OnNext</c>.
        /// </para>
        /// <para>
        /// When sending throws and the heartbeat reports a running task, the heartbeat is queued and the
        /// failure counter is incremented; when the counter reaches <c>_maxHeartbeatRetries</c> the
        /// context manager is told the driver is disconnected and the evaluator switches to RECOVERY mode.
        /// </para>
        /// </remarks>
        /// <param name="evaluatorHeartbeatProto">The heartbeat to send or queue.</param>
        public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
        {
            lock (_queuedHeartbeats)
            {
                // After a completed-state heartbeat has been accepted, no further heartbeat goes out.
                if (_isCompletedHeartbeatQueued)
                {
                    LOGGER.Log(Level.Warning, "Evaluator trying to schedule a heartbeat after a completed heartbeat has already been scheduled or sent.");
                    return;
                }

                // Remember that this heartbeat carries a completed evaluator state.
                if (IsEvaluatorStateCompleted(evaluatorHeartbeatProto.evaluator_status.state))
                {
                    _isCompletedHeartbeatQueued = true;
                }

                bool inRecovery = _evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY;
                if (inRecovery)
                {
                    // While recovering nothing is sent; the heartbeat is only queued.
                    string queuedNotice = string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto);
                    LOGGER.Log(Level.Warning, queuedNotice);
                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
                    return;
                }

                // Normal operation: attempt delivery. The message is built outside the try block,
                // so a failure while constructing it is not treated as a send failure.
                REEFMessage heartbeatMessage = new REEFMessage(evaluatorHeartbeatProto);
                try
                {
                    _observer.OnNext(heartbeatMessage);

                    // A successful send clears the count, so only consecutive failures add up
                    // (intermittent failures do not).
                    _heartbeatFailures = 0;
                }
                catch (Exception sendFailure)
                {
                    bool taskIsRunning = evaluatorHeartbeatProto.task_status != null
                        && evaluatorHeartbeatProto.task_status.state == State.RUNNING;
                    if (!taskIsRunning)
                    {
                        // NOTE(review): Exceptions.Throw presumably rethrows, so the code below is
                        // expected to run only for a running task - confirm against the helper.
                        Utilities.Diagnostics.Exceptions.Throw(sendFailure, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
                    }

                    _heartbeatFailures++;
                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);

                    string failureNotice = string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto);
                    LOGGER.Log(Level.Error, failureNotice, sendFailure);

                    // Below the retry limit: keep operating normally.
                    if (_heartbeatFailures < _maxHeartbeatRetries)
                    {
                        return;
                    }

                    // Retry limit reached: report the disconnect and switch to RECOVERY mode.
                    LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);
                    LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");

                    DriverConnectionMessageImpl disconnectedNotice = new DriverConnectionMessageImpl(DriverConnectionState.Disconnected);
                    ContextManager.HandleDriverConnectionMessage(disconnectedNotice);

                    LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                    _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                    // Start counting failures from zero again.
                    _heartbeatFailures = 0;
                }
            }
        }
Example #4
0
        /// <summary>
        /// Sends an evaluator heartbeat to the driver, or queues it when it cannot be delivered now.
        /// </summary>
        /// <remarks>
        /// The whole method runs while holding the lock on <c>_queuedHeartbeats</c>.
        /// <para>
        /// While the evaluator is in RECOVERY mode the heartbeat is only queued. Otherwise it is wrapped
        /// in a <c>REEFMessage</c> and passed to <c>_observer.OnNext</c>.
        /// </para>
        /// <para>
        /// When sending throws and the heartbeat reports a running task, the heartbeat is queued and the
        /// failure counter is incremented; when the counter reaches <c>_maxHeartbeatRetries</c> the
        /// context manager is told the driver is disconnected, an <c>IDriverConnection</c> is obtained
        /// from the evaluator injector, and the evaluator switches to RECOVERY mode.
        /// </para>
        /// </remarks>
        /// <param name="evaluatorHeartbeatProto">The heartbeat to send or queue.</param>
        public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
        {
            lock (_queuedHeartbeats)
            {
                bool inRecovery = _evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY;
                if (inRecovery)
                {
                    // While recovering nothing is sent; the heartbeat is only queued.
                    string queuedNotice = string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto);
                    LOGGER.Log(Level.Warning, queuedNotice);
                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
                    return;
                }

                // Normal operation: attempt delivery. The message is built outside the try block,
                // so a failure while constructing it is not treated as a send failure.
                REEFMessage heartbeatMessage = new REEFMessage(evaluatorHeartbeatProto);
                try
                {
                    _observer.OnNext(heartbeatMessage);

                    // A successful send clears the count, so only consecutive failures add up
                    // (intermittent failures do not).
                    _heartbeatFailures = 0;
                }
                catch (Exception sendFailure)
                {
                    bool taskIsRunning = evaluatorHeartbeatProto.task_status != null
                        && evaluatorHeartbeatProto.task_status.state == State.RUNNING;
                    if (!taskIsRunning)
                    {
                        // NOTE(review): Exceptions.Throw presumably rethrows, so the code below is
                        // expected to run only for a running task - confirm against the helper.
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(sendFailure, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
                    }

                    _heartbeatFailures++;
                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);

                    string failureNotice = string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto);
                    LOGGER.Log(Level.Error, failureNotice, sendFailure);

                    // Below the retry limit: keep operating normally.
                    if (_heartbeatFailures < _maxHeartbeatRetries)
                    {
                        return;
                    }

                    // Retry limit reached: report the disconnect and switch to RECOVERY mode.
                    LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);
                    LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");

                    DriverConnectionMessageImpl disconnectedNotice = new DriverConnectionMessageImpl(DriverConnectionState.Disconnected);
                    ContextManager.HandleDriverConnectionMessage(disconnectedNotice);

                    // Obtain the reconnect implementation from the evaluator's injector.
                    try
                    {
                        _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance<IDriverConnection>();
                    }
                    catch (Exception injectionFailure)
                    {
                        // NOTE(review): CaughtAndThrow presumably logs and rethrows - confirm against the helper.
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(injectionFailure, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
                    }

                    LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                    _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                    // Start counting failures from zero again.
                    _heartbeatFailures = 0;
                }
            }
        }