/// <summary>
/// Re-targets the heartbeat observer at the driver described by <paramref name="driverInformation"/>,
/// restarts the name client when one is configured, sends every queued heartbeat, and finally
/// marks the evaluator settings as OPERATIONAL.
/// </summary>
/// <param name="driverInformation">Supplies the driver remote identifier and the name server id.</param>
private void Recover(DriverInformation driverInformation)
{
    // Point the outgoing observer at the driver endpoint parsed from the supplied information.
    _remoteId = new SocketRemoteIdentifier(NetUtilities.ParseIpEndpoint(driverInformation.DriverRemoteIdentifier));
    var driverTarget = new RemoteEventEndPoint<REEFMessage>(_remoteId);
    _observer = _remoteManager.GetRemoteObserver(driverTarget);

    lock (_evaluatorSettings)
    {
        var nameClient = _evaluatorSettings.NameClient;
        if (nameClient != null)
        {
            try
            {
                var nameServerId = driverInformation.NameServerId;
                LOGGER.Log(Level.Verbose, "Trying to reset and reconnect to name server" + nameServerId);
                nameClient.Restart(NetUtilities.ParseIpEndpoint(nameServerId));
                LOGGER.Log(Level.Info, "Reconnected to name server: " + nameServerId);
            }
            catch (Exception e)
            {
                // A failed name client restart is handed to the diagnostics helper; recovery carries on.
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(e, Level.Error, LOGGER);
            }
        }
    }

    lock (_queuedHeartbeats)
    {
        bool recoveryHeartbeatPending = true;
        while (_queuedHeartbeats.Any())
        {
            LOGGER.Log(Level.Info, "Sending cached recovery heartbeats to " + _remoteId);
            try
            {
                REEFMessage outgoing;
                if (recoveryHeartbeatPending)
                {
                    // The first queued heartbeat is rebuilt so that it includes the recovery flag.
                    EvaluatorHeartbeatProto recoveryHeartbeat = ConstructRecoveryHeartBeat(_queuedHeartbeats.Dequeue());
                    LOGGER.Log(Level.Info, "Recovery heartbeat to be sent:" + recoveryHeartbeat);
                    outgoing = new REEFMessage(recoveryHeartbeat);
                }
                else
                {
                    outgoing = new REEFMessage(_queuedHeartbeats.Dequeue());
                }

                _observer.OnNext(outgoing);

                // Cleared only once a send has gone through.
                recoveryHeartbeatPending = false;
            }
            catch (Exception e)
            {
                // Failures during RECOVERY are not handled here.
                string failure = string.Format(
                    CultureInfo.InvariantCulture,
                    "Hearbeat attempt failed in RECOVERY mode to Driver {0} , giving up...",
                    _remoteId);
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(e, Level.Error, failure, LOGGER);
            }

            // Pause after each queued heartbeat.
            Thread.Sleep(500);
        }
    }

    _evaluatorSettings.OperationState = EvaluatorOperationState.OPERATIONAL;
    LOGGER.Log(Level.Info, "=========== Exiting RECOVERY mode. ===========");
}
/// <summary>
/// Builds a heartbeat manager that reports to <paramref name="remoteId"/>, taking the remote
/// manager, evaluator id, clock, heartbeat period and retry limit from <paramref name="settings"/>.
/// </summary>
/// <param name="settings">Evaluator settings the manager reads its collaborators and limits from.</param>
/// <param name="remoteId">Identifier of the remote endpoint heartbeats are sent to.</param>
public HeartBeatManager(EvaluatorSettings settings, IRemoteIdentifier remoteId)
{
    using (LOGGER.LogFunction("HeartBeatManager::HeartBeatManager"))
    {
        _remoteManager = settings.RemoteManager;
        _remoteId = remoteId;
        _evaluatorId = settings.EvalutorId;

        // Observer through which messages are pushed to the remote endpoint.
        var heartbeatTarget = new RemoteEventEndPoint<REEFMessage>(_remoteId);
        _observer = _remoteManager.GetRemoteObserver(heartbeatTarget);

        _clock = settings.RuntimeClock;
        _heartBeatPeriodInMillSeconds = settings.HeartBeatPeriodInMs;
        _maxHeartbeatRetries = settings.MaxHeartbeatFailures;
        EvaluatorSettings = settings;

        // Kick start the CPU perf counter.
        MachineStatus.ToString();
    }
}
/// <summary>
/// Injection constructor: stores the settings and the evaluator runtime / context manager futures,
/// and opens an observer towards the endpoint parsed from <paramref name="errorHandlerRid"/>.
/// </summary>
/// <param name="settings">Evaluator settings supplying the remote manager, clock and heartbeat limits.</param>
/// <param name="evaluatorRuntime">Injection future for the evaluator runtime.</param>
/// <param name="contextManager">Injection future for the context manager.</param>
/// <param name="errorHandlerRid">Endpoint string, parsed with <c>NetUtilities.ParseIpEndpoint</c>.</param>
private HeartBeatManager(
    EvaluatorSettings settings,
    IInjectionFuture<EvaluatorRuntime> evaluatorRuntime,
    IInjectionFuture<ContextManager> contextManager,
    [Parameter(typeof(ErrorHandlerRid))] string errorHandlerRid)
{
    using (LOGGER.LogFunction("HeartBeatManager::HeartBeatManager"))
    {
        // Injected collaborators.
        _evaluatorSettings = settings;
        _evaluatorRuntime = evaluatorRuntime;
        _contextManager = contextManager;
        _remoteManager = settings.RemoteManager;

        // Heartbeats initially go to the endpoint named by the error handler RID.
        var parsedEndpoint = NetUtilities.ParseIpEndpoint(errorHandlerRid);
        _remoteId = new SocketRemoteIdentifier(parsedEndpoint);
        var heartbeatTarget = new RemoteEventEndPoint<REEFMessage>(_remoteId);
        _observer = _remoteManager.GetRemoteObserver(heartbeatTarget);

        // Timing and retry configuration.
        _clock = settings.RuntimeClock;
        _heartBeatPeriodInMillSeconds = settings.HeartBeatPeriodInMs;
        _maxHeartbeatRetries = settings.MaxHeartbeatRetries;

        // Kick start the CPU perf counter.
        MachineStatus.ToString();
    }
}
/// <summary>
/// Injection constructor. Keeps the supplied settings and injection futures, resolves the remote
/// manager from the settings, and creates the observer for the endpoint given by
/// <paramref name="errorHandlerRid"/>.
/// </summary>
/// <param name="settings">Source of the remote manager, runtime clock, heartbeat period and retry limit.</param>
/// <param name="evaluatorRuntime">Future for the evaluator runtime; stored as given.</param>
/// <param name="contextManager">Future for the context manager; stored as given.</param>
/// <param name="errorHandlerRid">Endpoint text passed to <c>NetUtilities.ParseIpEndpoint</c>.</param>
private HeartBeatManager(
    EvaluatorSettings settings,
    IInjectionFuture<EvaluatorRuntime> evaluatorRuntime,
    IInjectionFuture<ContextManager> contextManager,
    [Parameter(typeof(ErrorHandlerRid))] string errorHandlerRid)
{
    using (LOGGER.LogFunction("HeartBeatManager::HeartBeatManager"))
    {
        _evaluatorSettings = settings;
        _evaluatorRuntime = evaluatorRuntime;
        _contextManager = contextManager;
        _remoteManager = settings.RemoteManager;

        // Resolve the destination first, then ask the remote manager for an observer to it.
        var destinationAddress = NetUtilities.ParseIpEndpoint(errorHandlerRid);
        _remoteId = new SocketRemoteIdentifier(destinationAddress);
        var destination = new RemoteEventEndPoint<REEFMessage>(_remoteId);
        _observer = _remoteManager.GetRemoteObserver(destination);

        _clock = settings.RuntimeClock;
        _heartBeatPeriodInMillSeconds = settings.HeartBeatPeriodInMs;
        _maxHeartbeatRetries = settings.MaxHeartbeatRetries;

        // Kick start the CPU perf counter.
        MachineStatus.ToString();
    }
}
/// <summary>
/// Creates a remote message from an identifier and a payload.
/// </summary>
/// <param name="id">Value stored in <c>Identifier</c>.</param>
/// <param name="message">Value stored in <c>Message</c>.</param>
public DefaultRemoteMessage(IRemoteIdentifier id, T message)
{
    this.Identifier = id;
    this.Message = message;
}
/// <summary>
/// Re-targets the heartbeat observer at the driver described by <paramref name="driverInformation"/>,
/// restarts the name client when one is configured, sends every queued heartbeat, marks the
/// evaluator settings as OPERATIONAL and reports the Reconnected driver connection state.
/// </summary>
/// <param name="driverInformation">Supplies the driver remote identifier and the name server id.</param>
private void Recover(DriverInformation driverInformation)
{
    // Point the outgoing observer at the driver endpoint parsed from the supplied information.
    _remoteId = new SocketRemoteIdentifier(NetUtilities.ParseIpEndpoint(driverInformation.DriverRemoteIdentifier));
    var driverTarget = new RemoteEventEndPoint<REEFMessage>(_remoteId);
    _observer = _remoteManager.GetRemoteObserver(driverTarget);

    lock (_evaluatorSettings)
    {
        var nameClient = _evaluatorSettings.NameClient;
        if (nameClient != null)
        {
            try
            {
                var nameServerId = driverInformation.NameServerId;
                LOGGER.Log(Level.Verbose, "Trying to reset and reconnect to name server" + nameServerId);
                nameClient.Restart(NetUtilities.ParseIpEndpoint(nameServerId));
                LOGGER.Log(Level.Info, "Reconnected to name server: " + nameServerId);
            }
            catch (Exception e)
            {
                // A failed name client restart is handed to the diagnostics helper; recovery carries on.
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.Caught(e, Level.Error, LOGGER);
            }
        }
    }

    lock (_queuedHeartbeats)
    {
        bool recoveryHeartbeatPending = true;
        while (_queuedHeartbeats.Any())
        {
            LOGGER.Log(Level.Info, "Sending cached recovery heartbeats to " + _remoteId);
            try
            {
                REEFMessage outgoing;
                if (recoveryHeartbeatPending)
                {
                    // The first queued heartbeat is rebuilt so that it includes the recovery flag.
                    EvaluatorHeartbeatProto recoveryHeartbeat = ConstructRecoveryHeartBeat(_queuedHeartbeats.Dequeue());
                    LOGGER.Log(Level.Info, "Recovery heartbeat to be sent:" + recoveryHeartbeat);
                    outgoing = new REEFMessage(recoveryHeartbeat);
                }
                else
                {
                    outgoing = new REEFMessage(_queuedHeartbeats.Dequeue());
                }

                _observer.OnNext(outgoing);

                // Cleared only once a send has gone through.
                recoveryHeartbeatPending = false;
            }
            catch (Exception e)
            {
                // Failures during RECOVERY are not handled here.
                string failure = string.Format(
                    CultureInfo.InvariantCulture,
                    "Hearbeat attempt failed in RECOVERY mode to Driver {0} , giving up...",
                    _remoteId);
                Org.Apache.REEF.Utilities.Diagnostics.Exceptions.CaughtAndThrow(e, Level.Error, failure, LOGGER);
            }

            // Pause after each queued heartbeat.
            Thread.Sleep(500);
        }
    }

    _evaluatorSettings.OperationState = EvaluatorOperationState.OPERATIONAL;

    var reconnected = new DriverConnectionMessageImpl(DriverConnectionState.Reconnected);
    ContextManager.HandleDriverConnectionMessage(reconnected);

    LOGGER.Log(Level.Info, "=========== Exiting RECOVERY mode. ===========");
}
/// <summary>
/// Logs an incoming REEF message (with extra detail when it carries a context control section),
/// updates the message counter and batch start time, then forwards the unchanged message to the
/// wrapped observer if one is set.
/// </summary>
/// <param name="value">Remote message whose <c>Message</c> and <c>Identifier</c> are logged and which is forwarded as-is.</param>
public void OnNext(IRemoteMessage<REEFMessage> value)
{
    REEFMessage remoteEvent = value.Message;
    IRemoteIdentifier id = value.Identifier;
    LOGGER.Log(
        Level.Info,
        string.Format(CultureInfo.InvariantCulture, "receive a ReefMessage from {0} Driver at {1}.", remoteEvent, id));

    if (remoteEvent.evaluatorControl != null && remoteEvent.evaluatorControl.context_control != null)
    {
        var contextControl = remoteEvent.evaluatorControl.context_control;

        string contextMessage = null;
        if (contextControl.context_message != null)
        {
            contextMessage = contextControl.context_message.ToString();
        }

        string taskMessage = null;
        if (contextControl.task_message != null)
        {
            taskMessage = ByteUtilities.ByteArrarysToString(contextControl.task_message);
        }

        // A context or task message takes precedence; otherwise at most one of the
        // control sections below is logged, checked in this fixed order.
        if (!string.IsNullOrEmpty(contextMessage) || !string.IsNullOrEmpty(taskMessage))
        {
            LOGGER.Log(
                Level.Info,
                string.Format(CultureInfo.InvariantCulture, "Control protobuf with context message [{0}] and task message [{1}]", contextMessage, taskMessage));
        }
        else if (contextControl.remove_context != null)
        {
            LOGGER.Log(
                Level.Info,
                string.Format(CultureInfo.InvariantCulture, "Control protobuf to remove context {0}", contextControl.remove_context.context_id));
        }
        else if (contextControl.add_context != null)
        {
            LOGGER.Log(
                Level.Info,
                string.Format(CultureInfo.InvariantCulture, "Control protobuf to add a context on top of {0}", contextControl.add_context.parent_context_id));
        }
        else if (contextControl.start_task != null)
        {
            LOGGER.Log(
                Level.Info,
                string.Format(CultureInfo.InvariantCulture, "Control protobuf to start an task in {0}", contextControl.start_task.context_id));
        }
        else if (contextControl.stop_task != null)
        {
            LOGGER.Log(Level.Info, "Control protobuf to stop task");
        }
        else if (contextControl.suspend_task != null)
        {
            LOGGER.Log(Level.Info, "Control protobuf to suspend task");
        }
    }

    // Record the start time before the first message is counted.
    if (_count == 0)
    {
        _begin = DateTime.Now;
        _origBegin = _begin;
    }

    // Every printBatchSize messages the batch start time is reset. The previous code also
    // computed an events-per-second figure here that was never read; that dead computation
    // (including a double-to-long cast of a possibly infinite value) has been removed.
    const int printBatchSize = 100000;
    if (Interlocked.Increment(ref _count) % printBatchSize == 0)
    {
        _begin = DateTime.Now;
    }

    // Copy the field to a local so the null check and the call use the same reference.
    var observer = _observer;
    if (observer != null)
    {
        observer.OnNext(value);
    }
}
/// <summary>
/// Creates an endpoint wrapper around the given remote identifier.
/// </summary>
/// <param name="id">Remote identifier stored by this endpoint.</param>
public RemoteEventEndPoint(IRemoteIdentifier id)
{
    this._id = id;
}