Ejemplo n.º 1
0
        /// <summary>
        /// Re-targets the remote identifier and observer at the driver endpoint described by
        /// <paramref name="driverInformation"/>, restarts the name client when one is configured,
        /// sends every queued heartbeat to the driver, and then marks the evaluator as OPERATIONAL.
        /// </summary>
        /// <param name="driverInformation">Supplies the driver remote identifier and the name server id.</param>
        private void Recover(DriverInformation driverInformation)
        {
            // Rebuild the remote id and the observer from the driver endpoint string.
            _remoteId = new SocketRemoteIdentifier(NetUtilities.ParseIpEndpoint(driverInformation.DriverRemoteIdentifier));
            _observer = _remoteManager.GetRemoteObserver(new RemoteEventEndPoint<REEFMessage>(_remoteId));

            lock (_evaluatorSettings)
            {
                if (_evaluatorSettings.NameClient != null)
                {
                    try
                    {
                        var nameServerId = driverInformation.NameServerId;
                        LOGGER.Log(Level.Verbose, "Trying to reset and reconnect to name server" + nameServerId);
                        _evaluatorSettings.NameClient.Restart(NetUtilities.ParseIpEndpoint(nameServerId));
                        LOGGER.Log(Level.Info, "Reconnected to name server: " + nameServerId);
                    }
                    catch (Exception nameClientError)
                    {
                        // A failed name client restart is reported through Exceptions.Caught;
                        // this block does not rethrow, so recovery continues with the heartbeats.
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(nameClientError, Level.Error, LOGGER);
                    }
                }
            }

            lock (_queuedHeartbeats)
            {
                // Stays true until the first queued heartbeat has been handed to the observer.
                bool recoveryHeartbeatPending = true;

                while (_queuedHeartbeats.Any())
                {
                    LOGGER.Log(Level.Info, "Sending cached recovery heartbeats to " + _remoteId);
                    try
                    {
                        REEFMessage outgoing;
                        if (recoveryHeartbeatPending)
                        {
                            // The first heartbeat is specially constructed to include the recovery flag.
                            EvaluatorHeartbeatProto recoveryHeartbeat = ConstructRecoveryHeartBeat(_queuedHeartbeats.Dequeue());
                            LOGGER.Log(Level.Info, "Recovery heartbeat to be sent:" + recoveryHeartbeat);
                            outgoing = new REEFMessage(recoveryHeartbeat);
                        }
                        else
                        {
                            outgoing = new REEFMessage(_queuedHeartbeats.Dequeue());
                        }

                        _observer.OnNext(outgoing);
                        recoveryHeartbeatPending = false;
                    }
                    catch (Exception sendError)
                    {
                        // Failures during RECOVERY are not handled here; they are passed to CaughtAndThrow.
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(
                            sendError,
                            Level.Error,
                            string.Format(CultureInfo.InvariantCulture, "Hearbeat attempt failed in RECOVERY mode to Driver {0} , giving up...", _remoteId),
                            LOGGER);
                    }

                    // Pause half a second after each queued heartbeat (still inside the queue lock).
                    Thread.Sleep(500);
                }
            }

            _evaluatorSettings.OperationState = EvaluatorOperationState.OPERATIONAL;
            LOGGER.Log(Level.Info, "=========== Exiting RECOVERY mode. ===========");
        }
Ejemplo n.º 2
0
 /// <summary>
 /// Creates a heartbeat manager from the evaluator settings and an explicit remote identifier.
 /// </summary>
 /// <param name="settings">Source of the remote manager, evaluator id, runtime clock, heartbeat period and failure limit.</param>
 /// <param name="remoteId">Identifier the remote observer is obtained for.</param>
 public HeartBeatManager(EvaluatorSettings settings, IRemoteIdentifier remoteId)
 {
     using (LOGGER.LogFunction("HeartBeatManager::HeartBeatManager"))
     {
         _remoteManager = settings.RemoteManager;
         _remoteId = remoteId;
         _evaluatorId = settings.EvalutorId;

         // The observer is obtained from the remote manager for an endpoint built on the remote id.
         var heartbeatEndpoint = new RemoteEventEndPoint<REEFMessage>(_remoteId);
         _observer = _remoteManager.GetRemoteObserver(heartbeatEndpoint);

         _clock = settings.RuntimeClock;
         _heartBeatPeriodInMillSeconds = settings.HeartBeatPeriodInMs;
         _maxHeartbeatRetries = settings.MaxHeartbeatFailures;
         EvaluatorSettings = settings;

         // Kick start the CPU perf counter.
         MachineStatus.ToString();
     }
 }
Ejemplo n.º 3
0
 private HeartBeatManager(
     EvaluatorSettings settings,
     IInjectionFuture<EvaluatorRuntime> evaluatorRuntime,
     IInjectionFuture<ContextManager> contextManager,
     [Parameter(typeof(ErrorHandlerRid))] string errorHandlerRid)
 {
     // Stores the settings and the injected futures, parses errorHandlerRid into the remote
     // identifier, obtains the observer for that identifier and copies the clock, heartbeat
     // period and retry limit from the settings.
     using (LOGGER.LogFunction("HeartBeatManager::HeartBeatManager"))
     {
         _evaluatorSettings = settings;
         _evaluatorRuntime = evaluatorRuntime;
         _contextManager = contextManager;
         _remoteManager = settings.RemoteManager;

         // errorHandlerRid is parsed as an IP endpoint string.
         var parsedEndpoint = NetUtilities.ParseIpEndpoint(errorHandlerRid);
         _remoteId = new SocketRemoteIdentifier(parsedEndpoint);

         var observerEndpoint = new RemoteEventEndPoint<REEFMessage>(_remoteId);
         _observer = _remoteManager.GetRemoteObserver(observerEndpoint);

         _clock = settings.RuntimeClock;
         _heartBeatPeriodInMillSeconds = settings.HeartBeatPeriodInMs;
         _maxHeartbeatRetries = settings.MaxHeartbeatRetries;

         // Kick start the CPU perf counter.
         MachineStatus.ToString();
     }
 }
Ejemplo n.º 4
0
 private HeartBeatManager(
     EvaluatorSettings settings,
     IInjectionFuture<EvaluatorRuntime> evaluatorRuntime,
     IInjectionFuture<ContextManager> contextManager,
     [Parameter(typeof(ErrorHandlerRid))] string errorHandlerRid)
 {
     // Initializes the manager in three steps: values taken from the settings, the injected
     // futures, and finally the remote identifier/observer derived from errorHandlerRid.
     using (LOGGER.LogFunction("HeartBeatManager::HeartBeatManager"))
     {
         // Values that come straight from the evaluator settings.
         _evaluatorSettings = settings;
         _remoteManager = settings.RemoteManager;
         _clock = settings.RuntimeClock;
         _heartBeatPeriodInMillSeconds = settings.HeartBeatPeriodInMs;
         _maxHeartbeatRetries = settings.MaxHeartbeatRetries;

         // Injection futures are stored as given.
         _evaluatorRuntime = evaluatorRuntime;
         _contextManager = contextManager;

         // Remote wiring: the identifier is parsed from errorHandlerRid, then the observer is obtained for it.
         _remoteId = new SocketRemoteIdentifier(NetUtilities.ParseIpEndpoint(errorHandlerRid));
         _observer = _remoteManager.GetRemoteObserver(new RemoteEventEndPoint<REEFMessage>(_remoteId));

         // Kick start the CPU perf counter.
         MachineStatus.ToString();
     }
 }
Ejemplo n.º 5
0
 /// <summary>
 /// Creates a remote message that pairs an identifier with a message payload.
 /// </summary>
 /// <param name="id">Value stored in <c>Identifier</c>.</param>
 /// <param name="message">Value stored in <c>Message</c>.</param>
 public DefaultRemoteMessage(IRemoteIdentifier id, T message)
 {
     Identifier = id;
     Message = message;
 }
Ejemplo n.º 6
0
        /// <summary>
        /// Reconnects to the driver described by <paramref name="driverInformation"/>: rebuilds the
        /// remote identifier and observer, restarts the name client if present, sends all queued
        /// heartbeats, sets the operation state to OPERATIONAL and reports the Reconnected state
        /// to the context manager.
        /// </summary>
        /// <param name="driverInformation">Provides the driver remote identifier and the name server id.</param>
        private void Recover(DriverInformation driverInformation)
        {
            IPEndPoint driverAddress = NetUtilities.ParseIpEndpoint(driverInformation.DriverRemoteIdentifier);
            _remoteId = new SocketRemoteIdentifier(driverAddress);

            var driverChannel = new RemoteEventEndPoint<REEFMessage>(_remoteId);
            _observer = _remoteManager.GetRemoteObserver(driverChannel);

            lock (_evaluatorSettings)
            {
                if (_evaluatorSettings.NameClient != null)
                {
                    try
                    {
                        LOGGER.Log(Level.Verbose, "Trying to reset and reconnect to name server" + driverInformation.NameServerId);

                        var nameServerEndpoint = NetUtilities.ParseIpEndpoint(driverInformation.NameServerId);
                        _evaluatorSettings.NameClient.Restart(nameServerEndpoint);

                        LOGGER.Log(Level.Info, "Reconnected to name server: " + driverInformation.NameServerId);
                    }
                    catch (Exception restartFailure)
                    {
                        // Reported through Exceptions.Caught; nothing is rethrown from this block.
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(restartFailure, Level.Error, LOGGER);
                    }
                }
            }

            lock (_queuedHeartbeats)
            {
                bool isFirstQueuedHeartbeat = true;
                while (_queuedHeartbeats.Any())
                {
                    LOGGER.Log(Level.Info, "Sending cached recovery heartbeats to " + _remoteId);
                    try
                    {
                        var queuedHeartbeat = _queuedHeartbeats.Dequeue();
                        if (!isFirstQueuedHeartbeat)
                        {
                            // Every heartbeat after the first is forwarded as it was queued.
                            _observer.OnNext(new REEFMessage(queuedHeartbeat));
                        }
                        else
                        {
                            // The first heartbeat is specially constructed to include the recovery flag.
                            EvaluatorHeartbeatProto recoveryHeartbeat = ConstructRecoveryHeartBeat(queuedHeartbeat);
                            LOGGER.Log(Level.Info, "Recovery heartbeat to be sent:" + recoveryHeartbeat);
                            _observer.OnNext(new REEFMessage(recoveryHeartbeat));
                            isFirstQueuedHeartbeat = false;
                        }
                    }
                    catch (Exception heartbeatFailure)
                    {
                        // Failures during RECOVERY are not handled here; they go to CaughtAndThrow.
                        string failureMessage = string.Format(
                            CultureInfo.InvariantCulture,
                            "Hearbeat attempt failed in RECOVERY mode to Driver {0} , giving up...",
                            _remoteId);
                        Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(
                            heartbeatFailure,
                            Level.Error,
                            failureMessage,
                            LOGGER);
                    }

                    // Half-second pause after each queued heartbeat, taken while the queue lock is held.
                    Thread.Sleep(500);
                }
            }

            _evaluatorSettings.OperationState = EvaluatorOperationState.OPERATIONAL;

            // Tell the context manager that the driver connection is in the Reconnected state.
            var reconnected = new DriverConnectionMessageImpl(DriverConnectionState.Reconnected);
            ContextManager.HandleDriverConnectionMessage(reconnected);

            LOGGER.Log(Level.Info, "=========== Exiting RECOVERY mode. ===========");
        }
Ejemplo n.º 7
0
        /// <summary>
        /// Handles an incoming remote message: logs it, logs which context-control request it
        /// carries (if any), counts it, and forwards it to the wrapped observer when one is set.
        /// </summary>
        /// <param name="value">Remote message carrying the <c>REEFMessage</c> and the sender's identifier.</param>
        public void OnNext(IRemoteMessage<REEFMessage> value)
        {
            REEFMessage remoteEvent = value.Message;
            IRemoteIdentifier id = value.Identifier;

            LOGGER.Log(Level.Info, string.Format(CultureInfo.InvariantCulture, "receive a ReefMessage from {0} Driver at {1}.", remoteEvent, id));

            if (remoteEvent.evaluatorControl != null && remoteEvent.evaluatorControl.context_control != null)
            {
                var contextControl = remoteEvent.evaluatorControl.context_control;

                string contextMessage = contextControl.context_message != null
                    ? contextControl.context_message.ToString()
                    : null;
                string taskMessage = contextControl.task_message != null
                    ? ByteUtilities.ByteArrarysToString(contextControl.task_message)
                    : null;

                // Exactly one description is logged: a context/task message takes precedence,
                // then remove/add context, then start/stop/suspend task.
                if (!string.IsNullOrEmpty(contextMessage) || !string.IsNullOrEmpty(taskMessage))
                {
                    LOGGER.Log(Level.Info,
                               string.Format(CultureInfo.InvariantCulture, "Control protobuf with context message [{0}] and task message [{1}]", contextMessage, taskMessage));
                }
                else if (contextControl.remove_context != null)
                {
                    LOGGER.Log(Level.Info,
                               string.Format(CultureInfo.InvariantCulture, "Control protobuf to remove context {0}", contextControl.remove_context.context_id));
                }
                else if (contextControl.add_context != null)
                {
                    LOGGER.Log(Level.Info,
                               string.Format(CultureInfo.InvariantCulture, "Control protobuf to add a context on top of {0}", contextControl.add_context.parent_context_id));
                }
                else if (contextControl.start_task != null)
                {
                    LOGGER.Log(Level.Info,
                               string.Format(CultureInfo.InvariantCulture, "Control protobuf to start an task in {0}", contextControl.start_task.context_id));
                }
                else if (contextControl.stop_task != null)
                {
                    LOGGER.Log(Level.Info, "Control protobuf to stop task");
                }
                else if (contextControl.suspend_task != null)
                {
                    LOGGER.Log(Level.Info, "Control protobuf to suspend task");
                }
            }

            // Record the start time when the counter is still zero.
            // NOTE(review): this zero check is not atomic with the increment below - confirm
            // whether OnNext can be invoked concurrently.
            if (_count == 0)
            {
                _begin = DateTime.Now;
                _origBegin = _begin;
            }

            var count = Interlocked.Increment(ref _count);

            const int printBatchSize = 100000;
            if (count % printBatchSize == 0)
            {
                // Start a new timing window for the next batch. The events-per-second value that
                // used to be computed here was never read, and its double-to-long cast was
                // ill-defined when the elapsed time was zero, so that computation was dropped.
                _begin = DateTime.Now;
            }

            // Copy the field to a local so the null check and the call use the same reference.
            var observer = _observer;
            if (observer != null)
            {
                observer.OnNext(value);
            }
        }
Ejemplo n.º 8
0
 /// <summary>
 /// Creates an endpoint for the given remote identifier.
 /// </summary>
 /// <param name="id">Remote identifier stored by this endpoint.</param>
 public RemoteEventEndPoint(IRemoteIdentifier id)
 {
     this._id = id;
 }