/// <summary>
/// Sends a heartbeat to the driver, or queues it when it cannot be delivered.
/// </summary>
/// <remarks>
/// All work happens while holding the lock on <c>_queuedHeartbeats</c>.
/// While the evaluator is in RECOVERY state the heartbeat is only queued.
/// Otherwise it is pushed through <c>_observer</c>; a failed send is counted and
/// queued, and once the count reaches <c>_maxHeartbeatRetries</c> the evaluator
/// switches to RECOVERY state.
/// </remarks>
/// <param name="evaluatorHeartbeatProto">The heartbeat message to deliver or queue.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        bool inRecovery = _evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY;
        if (inRecovery)
        {
            // Nothing is sent while recovering; the heartbeat is kept in the queue.
            LOGGER.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto));
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // Built outside the try block so that only the send itself is guarded.
        REEFMessage message = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(message);

            // A successful send ends any streak of failures.
            _heartbeatFailures = 0;
        }
        catch (Exception sendFailure)
        {
            var taskStatus = evaluatorHeartbeatProto.task_status;
            bool taskIsRunning = taskStatus != null && taskStatus.state == State.RUNNING;
            if (!taskIsRunning)
            {
                // Per the message below, recovery is only attempted while a task is running.
                Utilities.Diagnostics.Exceptions.Throw(sendFailure, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto), sendFailure);

            if (_heartbeatFailures < _maxHeartbeatRetries)
            {
                // Still below the retry limit: stay in the current state.
                return;
            }

            LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);
            LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");
            ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

            try
            {
                _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance<IDriverConnection>();
            }
            catch (Exception injectionFailure)
            {
                Utilities.Diagnostics.Exceptions.CaughtAndThrow(injectionFailure, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
            }

            LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
            _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

            // Start the failure count afresh now that recovery has begun.
            _heartbeatFailures = 0;
        }
    }
}
/// <summary>
/// Chooses the driver reconnection configuration provider to use, based on
/// which of the two injected instances is something other than its default type.
/// </summary>
/// <param name="driverReconnConfigProvider">The injected reconnection configuration provider.</param>
/// <param name="driverConnection">The injected driver connection.</param>
/// <returns>
/// <paramref name="driverReconnConfigProvider"/> when it is not a
/// <c>DefaultDriverReconnConfigProvider</c>; otherwise a
/// <c>DynamicDriverReconnConfigProvider</c> built from the runtime type of
/// <paramref name="driverConnection"/> when that is not a <c>MissingDriverConnection</c>;
/// otherwise a freshly injected <c>DefaultDriverReconnConfigProvider</c>.
/// </returns>
private static IDriverReconnConfigProvider GetDriverReconnectionProvider(
    IDriverReconnConfigProvider driverReconnConfigProvider,
    IDriverConnection driverConnection)
{
    if (driverReconnConfigProvider is DefaultDriverReconnConfigProvider)
    {
        if (driverConnection is MissingDriverConnection)
        {
            // Neither configuration was bound by the user. This is a stop gap for
            // deprecation: an IDriverConnection implementation cannot be bound into the
            // driver CLRBridgeConfiguration if the user already bound one, because the
            // driver and Evaluator configurations are combined at the Evaluator. Return
            // the provider that does not bind IDriverConnection so no TANG conflict occurs.
            return TangFactory.GetTang().NewInjector().GetInstance<DefaultDriverReconnConfigProvider>();
        }

        // A non-default connection means the user bound the old configuration;
        // use the dynamic configuration provider for its concrete type.
        return new DynamicDriverReconnConfigProvider(driverConnection.GetType());
    }

    // A non-default provider means the user bound the newer configuration.
    return driverReconnConfigProvider;
}
/// <summary>
/// Delivers a heartbeat to the driver through <c>_observer</c>, falling back to
/// queueing it in <c>_queuedHeartbeats</c> when delivery is not possible.
/// </summary>
/// <remarks>
/// The whole operation runs under a lock on <c>_queuedHeartbeats</c>. Repeated
/// send failures (up to <c>_maxHeartbeatRetries</c>) move the evaluator into the
/// RECOVERY operation state, in which heartbeats are queued without being sent.
/// </remarks>
/// <param name="evaluatorHeartbeatProto">Heartbeat to send or queue.</param>
public void Send(EvaluatorHeartbeatProto evaluatorHeartbeatProto)
{
    lock (_queuedHeartbeats)
    {
        if (_evaluatorSettings.OperationState == EvaluatorOperationState.RECOVERY)
        {
            string queuedNotice = string.Format(CultureInfo.InvariantCulture, "In RECOVERY mode, heartbeat queued as [{0}]. ", evaluatorHeartbeatProto);
            LOGGER.Log(Level.Warning, queuedNotice);
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);
            return;
        }

        // Not recovering: attempt the actual send.
        REEFMessage outgoing = new REEFMessage(evaluatorHeartbeatProto);
        try
        {
            _observer.OnNext(outgoing);

            // Failures are only counted when consecutive, so success resets the counter.
            _heartbeatFailures = 0;
        }
        catch (Exception error)
        {
            if (!(evaluatorHeartbeatProto.task_status != null && evaluatorHeartbeatProto.task_status.state == State.RUNNING))
            {
                // No running task: the failure is passed on instead of being retried.
                Utilities.Diagnostics.Exceptions.Throw(error, "Lost communications to driver when no task is running, recovery NOT supported for such scenario", LOGGER);
            }

            _heartbeatFailures++;
            _queuedHeartbeats.Enqueue(evaluatorHeartbeatProto);

            string failureNotice = string.Format(CultureInfo.InvariantCulture, "Sending heartbeat to driver experienced #{0} failure. Hearbeat queued as: [{1}]. ", _heartbeatFailures, evaluatorHeartbeatProto);
            LOGGER.Log(Level.Error, failureNotice, error);

            bool retriesExhausted = _heartbeatFailures >= _maxHeartbeatRetries;
            if (retriesExhausted)
            {
                LOGGER.Log(Level.Warning, "Heartbeat communications to driver reached max of {0} failures. Driver is considered dead/unreachable", _heartbeatFailures);
                LOGGER.Log(Level.Info, "=========== Entering RECOVERY mode. ===========");

                // Tell the context manager the driver is disconnected before switching state.
                ContextManager.HandleDriverConnectionMessage(new DriverConnectionMessageImpl(DriverConnectionState.Disconnected));

                try
                {
                    _driverConnection = _evaluatorSettings.EvaluatorInjector.GetInstance<IDriverConnection>();
                }
                catch (Exception injectError)
                {
                    Utilities.Diagnostics.Exceptions.CaughtAndThrow(injectError, Level.Error, "Failed to inject the driver reconnect implementation", LOGGER);
                }

                LOGGER.Log(Level.Info, "instantiate driver reconnect implementation: " + _driverConnection);
                _evaluatorSettings.OperationState = EvaluatorOperationState.RECOVERY;

                // Clear the failure counter on entering recovery.
                _heartbeatFailures = 0;
            }
        }
    }
}
/// <summary>
/// Creates a <c>RemoteLearnMode</c> and stores the supplied driver connection
/// in the <c>_connection</c> field.
/// </summary>
/// <param name="connection">The driver connection to keep; stored as given, without validation.</param>
public RemoteLearnMode(IDriverConnection connection)
{
    this._connection = connection;
}
/// <summary>
/// Constructs the driver bridge: installs trace listeners and the trace level,
/// stores the injected event handler sets, serializes the merged Evaluator
/// configuration providers, and creates one CLR system handler per event type.
/// </summary>
/// <remarks>
/// The <c>ISet&lt;IObserver&lt;T&gt;&gt;</c> parameters are the handler sets bound through
/// the corresponding <c>DriverBridgeConfigurationOptions</c> named parameters; each
/// is stored unchanged in the field of the same name.
/// </remarks>
/// <param name="traceListeners">Trace listeners added to <c>Logger</c> before the bridge logger is obtained.</param>
/// <param name="configurationProviders">Configuration providers whose configurations are merged and serialized.</param>
/// <param name="traceLevel">Name of a <c>Level</c> value; when it cannot be parsed a warning is logged and no custom level is set.</param>
/// <param name="driverReconnConfigProvider">Injected driver reconnection configuration provider.</param>
/// <param name="driverConnection">Injected driver connection, consulted when the reconnection provider is the default one.</param>
/// <param name="httpServerHandler">HTTP server handler stored on the bridge.</param>
/// <param name="progressProvider">Progress provider stored on the bridge.</param>
/// <param name="serializer">Serializer used to turn the merged configuration into a string.</param>
public DriverBridge(
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.DriverStartedHandlers))] ISet<IObserver<IDriverStarted>> driverStartHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.DriverRestartedHandlers))] ISet<IObserver<IDriverRestarted>> driverRestartedHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.EvaluatorRequestHandlers))] ISet<IObserver<IEvaluatorRequestor>> evaluatorRequestHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.AllocatedEvaluatorHandlers))] ISet<IObserver<IAllocatedEvaluator>> allocatedEvaluatorHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.ActiveContextHandlers))] ISet<IObserver<IActiveContext>> activeContextHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.TaskMessageHandlers))] ISet<IObserver<ITaskMessage>> taskMessageHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.FailedTaskHandlers))] ISet<IObserver<IFailedTask>> failedTaskHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.FailedEvaluatorHandlers))] ISet<IObserver<IFailedEvaluator>> failedEvaluatorHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.CompletedEvaluatorHandlers))] ISet<IObserver<ICompletedEvaluator>> completedEvaluatorHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.RunningTaskHandlers))] ISet<IObserver<IRunningTask>> runningTaskHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.CompletedTaskHandlers))] ISet<IObserver<ICompletedTask>> completedTaskHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.SuspendedTaskHandlers))] ISet<IObserver<ISuspendedTask>> suspendedTaskHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.ClosedContextHandlers))] ISet<IObserver<IClosedContext>> closedContextHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.FailedContextHandlers))] ISet<IObserver<IFailedContext>> failedContextHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.ContextMessageHandlers))] ISet<IObserver<IContextMessage>> contextMessageHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.DriverRestartActiveContextHandlers))] ISet<IObserver<IActiveContext>> driverRestartActiveContextHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.DriverRestartRunningTaskHandlers))] ISet<IObserver<IRunningTask>> driverRestartRunningTaskHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.DriverRestartCompletedHandlers))] ISet<IObserver<IDriverRestartCompleted>> driverRestartCompletedHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.DriverRestartFailedEvaluatorHandlers))] ISet<IObserver<IFailedEvaluator>> driverRestartFailedEvaluatorHandlers,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.TraceListenersSet))] ISet<TraceListener> traceListeners,
    [Parameter(Value = typeof(EvaluatorConfigurationProviders))] ISet<IConfigurationProvider> configurationProviders,
    [Parameter(Value = typeof(DriverBridgeConfigurationOptions.TraceLevel))] string traceLevel,
    IDriverReconnConfigProvider driverReconnConfigProvider,
    IDriverConnection driverConnection,
    HttpServerHandler httpServerHandler,
    IProgressProvider progressProvider,
    AvroConfigurationSerializer serializer)
{
    // Listeners are registered first so that the messages logged below reach them.
    foreach (TraceListener listener in traceListeners)
    {
        Logger.AddTraceListener(listener);
    }

    _logger = Logger.GetLogger(typeof(DriverBridge));
    _logger.Log(Level.Info, "Constructing DriverBridge");

    Level level;
    if (!Enum.TryParse(traceLevel.ToString(CultureInfo.InvariantCulture), out level))
    {
        _logger.Log(Level.Warning, string.Format(CultureInfo.InvariantCulture, "Invalid trace level {0} provided, will by default use verbose level", traceLevel));
    }
    else
    {
        Logger.SetCustomLevel(level);
    }

    _driverStartHandlers = driverStartHandlers;
    _evaluatorRequestHandlers = evaluatorRequestHandlers;
    _allocatedEvaluatorHandlers = allocatedEvaluatorHandlers;
    _activeContextHandlers = activeContextHandlers;
    _taskMessageHandlers = taskMessageHandlers;
    _failedEvaluatorHandlers = failedEvaluatorHandlers;
    _failedTaskHandlers = failedTaskHandlers;
    _completedTaskHandlers = completedTaskHandlers;
    _runningTaskHandlers = runningTaskHandlers;
    _suspendedTaskHandlers = suspendedTaskHandlers;
    _completedEvaluatorHandlers = completedEvaluatorHandlers;
    _closedContextHandlers = closedContextHandlers;
    _failedContextHandlers = failedContextHandlers;
    _contextMessageHandlers = contextMessageHandlers;
    _driverRestartedHandlers = driverRestartedHandlers;
    _driverRestartActiveContextHandlers = driverRestartActiveContextHandlers;
    _driverRestartRunningTaskHandlers = driverRestartRunningTaskHandlers;
    _driverRestartCompletedHandlers = driverRestartCompletedHandlers;
    _driverRestartFailedEvaluatorHandlers = driverRestartFailedEvaluatorHandlers;
    _httpServerHandler = httpServerHandler;

    // Resolve the reconnection provider from BOTH injected values rather than
    // adding driverReconnConfigProvider directly. Otherwise driverConnection is
    // never read, and a user who bound only the older IDriverConnection
    // configuration would have that binding silently ignored.
    var configurationProviderSet = new HashSet<IConfigurationProvider>(configurationProviders)
    {
        GetDriverReconnectionProvider(driverReconnConfigProvider, driverConnection)
    };

    // The merged provider configurations are kept in serialized (string) form.
    _configurationProviderString = serializer.ToString(
        Configurations.Merge(configurationProviderSet.Select(x => x.GetConfiguration()).ToArray()));
    _progressProvider = progressProvider;

    _allocatedEvaluatorSubscriber = new ClrSystemHandler<IAllocatedEvaluator>();
    _completedEvaluatorSubscriber = new ClrSystemHandler<ICompletedEvaluator>();
    _taskMessageSubscriber = new ClrSystemHandler<ITaskMessage>();
    _activeContextSubscriber = new ClrSystemHandler<IActiveContext>();
    _failedTaskSubscriber = new ClrSystemHandler<IFailedTask>();
    _failedEvaluatorSubscriber = new ClrSystemHandler<IFailedEvaluator>();
    _httpServerEventSubscriber = new ClrSystemHandler<IHttpMessage>();
    _completedTaskSubscriber = new ClrSystemHandler<ICompletedTask>();
    _runningTaskSubscriber = new ClrSystemHandler<IRunningTask>();
    _suspendedTaskSubscriber = new ClrSystemHandler<ISuspendedTask>();
    _closedContextSubscriber = new ClrSystemHandler<IClosedContext>();
    _failedContextSubscriber = new ClrSystemHandler<IFailedContext>();
    _contextMessageSubscriber = new ClrSystemHandler<IContextMessage>();
    _driverRestartedSubscriber = new ClrSystemHandler<IDriverRestarted>();
    _driverRestartActiveContextSubscriber = new ClrSystemHandler<IActiveContext>();
    _driverRestartRunningTaskSubscriber = new ClrSystemHandler<IRunningTask>();
    _driverRestartCompletedSubscriber = new ClrSystemHandler<IDriverRestartCompleted>();
    _driverRestartFailedEvaluatorSubscriber = new ClrSystemHandler<IFailedEvaluator>();
}