Пример #1
0
 private EvaluatorHeartbeatProto ConstructRecoveryHeartBeat(EvaluatorHeartbeatProto heartbeat)
 {
     heartbeat.recovery = true;
     heartbeat.context_status.ForEach(c => c.recovery = true);
     heartbeat.task_status.recovery = true;
     return(heartbeat);
 }
Пример #2
0
        /// <summary>
        ///  Receives and routes heartbeats from Evaluators.
        /// </summary>
        /// <param name="evaluatorHearBeatProto"></param>
        private void Handle(IRemoteMessage <EvaluatorHeartbeatProto> evaluatorHearBeatProto)
        {
            EvaluatorHeartbeatProto heartbeat = evaluatorHearBeatProto.Message;
            EvaluatorStatusProto    status    = heartbeat.evaluator_status;
            string evaluatorId = status.evaluator_id;

            LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Heartbeat from Evaluator {0} with state {1} timestamp {2}", evaluatorId, status.state, heartbeat.timestamp));
            _sanityChecker.check(evaluatorId, heartbeat.timestamp);

            lock (_evaluators)
            {
                if (_evaluators.ContainsKey(evaluatorId))
                {
                    EvaluatorManager evaluatorManager = _evaluators[evaluatorId];
                    evaluatorManager.Handle(evaluatorHearBeatProto);
                }
                else
                {
                    string msg = "Contact from unkonwn evaluator with id: " + evaluatorId;
                    if (heartbeat.evaluator_status != null)
                    {
                        msg += " with state" + status.state;
                    }
                    LOGGER.Log(Level.Error, msg);
                    Exceptions.Throw(new InvalidOperationException(msg), LOGGER);
                }
            }
        }
Пример #3
0
        private void Recover(DriverInformation driverInformation)
        {
            IPEndPoint driverEndpoint = NetUtilities.ParseIpEndpoint(driverInformation.DriverRemoteIdentifier);

            _remoteId = new SocketRemoteIdentifier(driverEndpoint);
            _observer = _remoteManager.GetRemoteObserver(new RemoteEventEndPoint <REEFMessage>(_remoteId));
            lock (_evaluatorSettings)
            {
                if (_evaluatorSettings.NameClient != null)
                {
                    try
                    {
                        LOGGER.Log(Level.Verbose, "Trying to reset and reconnect to name server" + driverInformation.NameServerId);
                        _evaluatorSettings.NameClient.Restart(NetUtilities.ParseIpEndpoint(driverInformation.NameServerId));
                        LOGGER.Log(Level.Info, "Reconnected to name server: " + driverInformation.NameServerId);
                    }
                    catch (Exception e)
                    {
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(e, Level.Error, LOGGER);
                    }
                }
            }

            lock (_queuedHeartbeats)
            {
                bool firstHeartbeatInQueue = true;
                while (_queuedHeartbeats.Any())
                {
                    LOGGER.Log(Level.Info, "Sending cached recovery heartbeats to " + _remoteId);
                    try
                    {
                        if (firstHeartbeatInQueue)
                        {
                            // first heartbeat is specially construted to include the recovery flag
                            EvaluatorHeartbeatProto recoveryHeartbeat = ConstructRecoveryHeartBeat(_queuedHeartbeats.Dequeue());
                            LOGGER.Log(Level.Info, "Recovery heartbeat to be sent:" + recoveryHeartbeat);
                            _observer.OnNext(new REEFMessage(recoveryHeartbeat));
                            firstHeartbeatInQueue = false;
                        }
                        else
                        {
                            _observer.OnNext(new REEFMessage(_queuedHeartbeats.Dequeue()));
                        }
                    }
                    catch (Exception e)
                    {
                        // we do not handle failures during RECOVERY
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(
                            e,
                            Level.Error,
                            string.Format(CultureInfo.InvariantCulture, "Hearbeat attempt failed in RECOVERY mode to Driver {0} , giving up...", _remoteId),
                            LOGGER);
                    }
                    Thread.Sleep(500);
                }
            }
            _evaluatorSettings.OperationState = EvaluatorOperationState.OPERATIONAL;
            LOGGER.Log(Level.Info, "=========== Exiting RECOVERY mode. ===========");
        }
Пример #4
0
        public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
        {
            lock (_queuedHeartbeats)
            {
                // Do not send a heartbeat if Evaluator has already signaled that it was done.
                if (_isCompletedHeartbeatQueued)
                {
                    LOGGER.Log(Level.Warning, "Evaluator trying to schedule a heartbeat after a completed heartbeat has already been scheduled or sent.");
                    return;
                }

                if (IsEvaluatorStateCompleted(evaluatorHeartbeatProto.evaluator_status.state))
                {
                    _isCompletedHeartbeatQueued = true;
                }

                if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
                {
                    LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
                    return;
                }

                // NOT during recovery, try to send
                REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
                try
                {
                    _observer.OnNext(payload);
                    _heartbeatFailures = 0; // reset failure counts if we are having intermidtten (not continuous) failures
                }
                catch (Exception e)
                {
                    if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
                    {
                        Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
                    }

                    _heartbeatFailures++;

                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
                    LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

                    if (_heartbeatFailures >= _maxHeartbeatRetries)
                    {
                        LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);
                        LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");
                        ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                        LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                        _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                        // clean heartbeat failure
                        _heartbeatFailures = 0;
                    }
                }
            }
        }
Пример #5
0
 /// <summary>
 /// Assemble a complete new heartbeat and send it out.
 /// </summary>
 public void OnNext()
 {
     LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext()");
     lock (this)
     {
         LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext()");
         EvaluatorHeartbeatProto heartbeatProto = GetEvaluatorHeartbeatProto();
         LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}.", heartbeatProto));
         Send(heartbeatProto);
     }
 }
Пример #6
0
        public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
        {
            lock (_queuedHeartbeats)
            {
                if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
                {
                    LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
                    return;
                }

                // NOT during recovery, try to send
                REEFMessage payload = new REEFMessage(evaluatorHeartbeatProto);
                try
                {
                    _observer.OnNext(payload);
                    _heartbeatFailures = 0; // reset failure counts if we are having intermidtten (not continuous) failures
                }
                catch (Exception e)
                {
                    if (evaluatorHeartbeatProto.task_status == null || evaluatorHeartbeatProto.task_status.state != State.RUNNING)
                    {
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Throw(e, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
                    }

                    _heartbeatFailures++;

                    _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
                    LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), e);

                    if (_heartbeatFailures >= _maxHeartbeatRetries)
                    {
                        LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);
                        LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");
                        ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                        try
                        {
                            _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance <IDriverConnection>();
                        }
                        catch (Exception ex)
                        {
                            Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(ex, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
                        }
                        LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                        _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                        // clean heartbeat failure
                        _heartbeatFailures = 0;
                    }
                }
            }
        }
Пример #7
0
        public void OnNext(Alarm value)
        {
            LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext(Alarm)");
            lock (this)
            {
                LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(Alarm)");

                if (_evaluatorSettings.OperationState == EvaluatorOperationState.OPERATIONAL && EvaluatorRuntime.State == State.RUNNING)
                {
                    EvaluatorHeartbeatProto evaluatorHeartbeatProto = GetEvaluatorHeartbeatProto();
                    LOGGER.Log(Level.Verbose, string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}. {1}Node Health: {2}", evaluatorHeartbeatProto, Environment.NewLine, MachineStatus.ToString()));
                    Send(evaluatorHeartbeatProto);
                }
                else
                {
                    LOGGER.Log(Level.Verbose, "Ignoring regular heartbeat since Evaluator operation state is [{0}] and runtime state is [{1}]. ", EvaluatorSettings.OperationState, EvaluatorRuntime.State);

                    // Do not try to recover if Evaluator is done.
                    if (IsEvaluatorStateCompleted(EvaluatorRuntime.State))
                    {
                        return;
                    }

                    if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
                    {
                        var driverConnection = _driverConnection.Get();

                        try
                        {
                            var driverInformation = driverConnection.GetDriverInformation();
                            if (driverInformation == null)
                            {
                                LOGGER.Log(Level.Verbose, "In RECOVERY mode, cannot retrieve driver information, will try again later.");
                            }
                            else
                            {
                                LOGGER.Log(
                                    Level.Info,
                                    string.Format(CultureInfo.InvariantCulture, "Detect driver restarted at {0} and is running on endpoint {1} with services {2}. Now trying to re-establish connection", driverInformation.DriverStartTime, driverInformation.DriverRemoteIdentifier, driverInformation.NameServerId));
                                Recover(driverInformation);
                            }
                        }
                        catch (Exception e)
                        {
                            // we do not want any exception to stop the query for driver status
                            Utilities.Diagnostics.Exceptions.Caught(e, Level.Warning, LOGGER);
                        }
                    }
                }

                _clock.ScheduleAlarm(_heartBeatPeriodInMillSeconds, this);
            }
        }
Пример #8
0
 /// <summary>
 /// Called with a specific TaskStatus that must be delivered to the driver
 /// </summary>
 /// <param name="taskStatusProto"></param>
 public void OnNext(TaskStatusProto taskStatusProto)
 {
     LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext(TaskStatusProto)");
     lock (this)
     {
         LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(TaskStatusProto)");
         EvaluatorHeartbeatProto heartbeatProto = GetEvaluatorHeartbeatProto(
             EvaluatorRuntime.GetEvaluatorStatus(),
             ContextManager.GetContextStatusCollection(),
             Optional <TaskStatusProto> .Of(taskStatusProto));
         LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}.", heartbeatProto));
         Send(heartbeatProto);
     }
 }
Пример #9
0
 /// <summary>
 /// Called with a specific EvaluatorStatus that must be delivered to the driver
 /// </summary>
 /// <param name="evaluatorStatusProto"></param>
 public void OnNext(EvaluatorStatusProto evaluatorStatusProto)
 {
     LOGGER.Log(Level.Verbose, "Before acquiring lock: HeartbeatManager::OnNext(EvaluatorStatusProto)");
     lock (this)
     {
         LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(EvaluatorStatusProto)");
         EvaluatorHeartbeatProto heartbeatProto = new EvaluatorHeartbeatProto()
         {
             timestamp        = CurrentTimeMilliSeconds(),
             evaluator_status = evaluatorStatusProto
         };
         LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}.", heartbeatProto));
         Send(heartbeatProto);
     }
 }
Пример #10
0
 /// <summary>
 ///  Called with a specific ContextStatusProto that must be delivered to the driver
 /// </summary>
 /// <param name="contextStatusProto"></param>
 public void OnNext(ContextStatusProto contextStatusProto)
 {
     LOGGER.Log(Level.Verbose, "Before aqcuiring lock: HeartbeatManager::OnNext(ContextStatusProto)");
     lock (this)
     {
         LOGGER.Log(Level.Verbose, "HeartbeatManager::OnNext(ContextStatusProto)");
         List <ContextStatusProto> contextStatusProtos = new List <ContextStatusProto>();
         contextStatusProtos.Add(contextStatusProto);
         contextStatusProtos.AddRange(_contextManager.GetContextStatusCollection());
         EvaluatorHeartbeatProto heartbeatProto = GetEvaluatorHeartbeatProto(
             _evaluatorRuntime.GetEvaluatorStatus(),
             contextStatusProtos,
             Optional <TaskStatusProto> .Empty());
         LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Triggered a heartbeat: {0}.", heartbeatProto));
         Send(heartbeatProto);
     }
 }
Пример #11
0
        private EvaluatorHeartbeatProto GetEvaluatorHeartbeatProto(
            EvaluatorStatusProto evaluatorStatusProto,
            ICollection <ContextStatusProto> contextStatusProtos,
            Optional <TaskStatusProto> taskStatusProto)
        {
            EvaluatorHeartbeatProto evaluatorHeartbeatProto = new EvaluatorHeartbeatProto()
            {
                timestamp        = CurrentTimeMilliSeconds(),
                evaluator_status = evaluatorStatusProto
            };

            foreach (ContextStatusProto contextStatusProto in contextStatusProtos)
            {
                evaluatorHeartbeatProto.context_status.Add(contextStatusProto);
            }
            if (taskStatusProto.IsPresent())
            {
                evaluatorHeartbeatProto.task_status = taskStatusProto.Value;
            }
            return(evaluatorHeartbeatProto);
        }
Пример #12
0
        public void Handle(IRemoteMessage <EvaluatorHeartbeatProto> evaluatorHearBeatProtoMessage)
        {
            lock (_evaluatorDescriptor)
            {
                EvaluatorHeartbeatProto heartbeatProto = evaluatorHearBeatProtoMessage.Message;
                if (heartbeatProto.evaluator_status != null)
                {
                    EvaluatorStatusProto status = heartbeatProto.evaluator_status;
                    if (status.error != null)
                    {
                        Handle(new EvaluatorException(Id, ByteUtilities.ByteArrarysToString(status.error)));
                        return;
                    }
                    else if (_state == STATE.SUBMITTED)
                    {
                        string evaluatorRId = evaluatorHearBeatProtoMessage.Identifier.ToString();
                        LOGGER.Log(Level.Info, "TODO: REPLACE THIS " + evaluatorRId);
                        // TODO
                        // _evaluatorControlHandler = _remoteManager.getHandler(evaluatorRID, EvaluatorRuntimeProtocol.EvaluatorControlProto.class);
                        _state = STATE.RUNNING;
                        LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Evaluator {0} is running", _evaluatorId));
                    }
                }

                LOGGER.Log(Level.Info, "Evaluator heartbeat: " + heartbeatProto);

                EvaluatorStatusProto evaluatorStatusProto = heartbeatProto.evaluator_status;
                foreach (ContextStatusProto contextStatusProto in heartbeatProto.context_status)
                {
                    Handle(contextStatusProto, heartbeatProto.task_status != null);
                }

                if (heartbeatProto.task_status != null)
                {
                    Handle(heartbeatProto.task_status);
                }

                if (evaluatorStatusProto.state == State.FAILED)
                {
                    _state = STATE.FAILED;
                    EvaluatorException e = evaluatorStatusProto.error != null ?
                                           new EvaluatorException(_evaluatorId, ByteUtilities.ByteArrarysToString(evaluatorStatusProto.error)) :
                                           new EvaluatorException(_evaluatorId, "unknown cause");
                    LOGGER.Log(Level.Warning, "Failed evaluator: " + Id + e.Message);
                    Handle(e);
                }
                else if (evaluatorStatusProto.state == State.DONE)
                {
                    LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "Evaluator {0} done", Id));
                    _state = STATE.DONE;

                    // TODO
                    // dispatcher.onNext(CompletedEvaluator.class, new CompletedEvaluator() {
                    //@Override
                    //public String getId() {
                    //  return EvaluatorManager.this.evaluatorId;
                    Dispose();
                }
            }
            LOGGER.Log(Level.Info, "DONE with evaluator heartbeat");
        }
Пример #13
0
 public REEFMessage(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
 {
     _evaluatorHeartBeat = evaluatorHeartbeatProto;
 }