static ICommand EnterCheckpointingHelper(EndpointExecutorFsm thisPtr) { ICommand next; // If there was a partial failure, try to resend the failed messages // Copy the initial send message command to keep the uncompleted TaskCompletionSource if (thisPtr.currentCheckpointCommand.Result.Failed.Count > 0) { TimeSpan retryAfter; // PartialFailures should always have an exception object filled out Exception innerException = thisPtr.currentCheckpointCommand.Result.SendFailureDetails.GetOrElse(DefaultSendFailureDetails).RawException; // 1. Change Messages to be sent to failedMessages // 2. Change the Command type to Fail or Dead depending on retry thisPtr.currentSendCommand = thisPtr.currentSendCommand.Copy(thisPtr.currentCheckpointCommand.Result.Failed); bool shouldRetry = thisPtr.ShouldRetry(innerException, out retryAfter); Events.CheckRetryInnerException(innerException, shouldRetry); if (shouldRetry) { next = Commands.Fail(retryAfter); } else { next = thisPtr.config.ThrowOnDead ? (ICommand)Commands.Throw(innerException) : Commands.Die; } } else { next = Commands.Succeed; } return(next); }
public static void RetryDelay(EndpointExecutorFsm fsm) { DateTime next = fsm.systemTime.UtcNow.SafeAdd(fsm.Status.RetryPeriod); Log.LogDebug((int)EventIds.RetryDelay, "[RetryDelay] Waiting to retry. Retry.Attempts: {0}, Retry.Period: {1}, Retry.Next: {2}, {3}", fsm.Status.RetryAttempts, fsm.Status.RetryPeriod.ToString(TimeSpanFormat), next.ToString(DateTimeFormat), GetContextString(fsm)); }
static async Task EnterDeadCheckpointingAsync(EndpointExecutorFsm thisPtr) { ICommand next; try { Preconditions.CheckNotNull(thisPtr.currentCheckpointCommand); using (var cts = new CancellationTokenSource(thisPtr.config.Timeout)) { ISinkResult <IMessage> result = thisPtr.currentCheckpointCommand.Result; Events.Checkpoint(thisPtr, result); await thisPtr.Checkpointer.CommitAsync(result.Succeeded, EmptyMessages, thisPtr.lastFailedRevivalTime, thisPtr.unhealthySince, cts.Token); Events.CheckpointSuccess(thisPtr, result); } next = Commands.DeadSucceed; Events.DeadSuccess(thisPtr, thisPtr.currentCheckpointCommand.Result.Succeeded); } catch (Exception ex) { Events.CheckpointFailure(thisPtr, ex); next = thisPtr.config.ThrowOnDead ? (ICommand)Commands.Throw(ex) : Commands.DeadSucceed; } await RunInternalAsync(thisPtr, next); }
static Task EnterFailingAsync(EndpointExecutorFsm thisPtr) { thisPtr.retryAttempts++; thisPtr.retryTimer.Change(thisPtr.retryPeriod, Timeout.InfiniteTimeSpan); Events.RetryDelay(thisPtr); return(TaskEx.Done); }
static async Task EnterProcessDeadAsync(EndpointExecutorFsm thisPtr) { Preconditions.CheckArgument(thisPtr.lastFailedRevivalTime.HasValue); TimeSpan deadFor = thisPtr.systemTime.UtcNow - thisPtr.lastFailedRevivalTime.GetOrElse(thisPtr.systemTime.UtcNow); if (deadFor >= thisPtr.config.RevivePeriod) { await RunInternalAsync(thisPtr, Commands.Revive); } else { try { // In the dead state, checkpoint all "sent" messages // This effectively drops the messages, allowing the other endpoints in // this router to make progress. ICollection <IMessage> tocheckpoint = thisPtr.currentSendCommand.Messages; Events.Dead(thisPtr, tocheckpoint); SendFailureDetails persistingFailureDetails = thisPtr.currentCheckpointCommand?.Result?.SendFailureDetails.GetOrElse(default(SendFailureDetails)); await RunInternalAsync(thisPtr, Commands.Checkpoint(new SinkResult <IMessage>(tocheckpoint, persistingFailureDetails))); } catch (Exception ex) { Events.DeadFailure(thisPtr, ex); } } }
static async Task RunInternalAsync(EndpointExecutorFsm thisPtr, ICommand command) { StateTransition transition; var pair = new StateCommandPair(thisPtr.state, command.Type); if (!Transitions.TryGetValue(pair, out transition)) { throw new InvalidOperationException($"Unknown state transition. In state.{thisPtr.state}, Got command.{command.Type}"); } StateActions currentActions = Actions[thisPtr.state]; StateActions nextActions = Actions[transition.NextState]; await currentActions.Exit(thisPtr); Events.StateExit(thisPtr); await transition.TransitionAction(thisPtr, command); Events.StateTransition(thisPtr, thisPtr.state, transition.NextState); thisPtr.state = transition.NextState; await nextActions.Enter(thisPtr); Events.StateEnter(thisPtr); }
static async Task UpdateEndpointAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is UpdateEndpoint); thisPtr.retryAttempts = 0; Events.UpdateEndpoint(thisPtr); try { var update = (UpdateEndpoint)command; await thisPtr.processor.CloseAsync(CancellationToken.None); thisPtr.processor = update.Endpoint.CreateProcessor(); Events.UpdateEndpointSuccess(thisPtr); } catch (Exception ex) { Events.UpdateEndpointFailure(thisPtr, ex); // TODO(manusr): If this throws, it will break the state machine throw; } }
static void SetProcessingInternalCounters(EndpointExecutorFsm fsm, string status, ICollection <IMessage> messages) { if (!messages.Any()) { return; } if (!Routing.PerfCounter.LogEventsProcessed(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, status, messages.Count, out string error)) { Log.LogError((int)EventIds.CounterFailure, "[LogEventsProcessedCounterFailed] {0}", error); } double totalTimeMSecs = messages.Select(m => m.DequeuedTime).Aggregate(0D, (span, time) => span + (fsm.systemTime.UtcNow - time).TotalMilliseconds); long averageLatencyInMs = totalTimeMSecs < 0 ? 0L : (long)(totalTimeMSecs / messages.Count); if (!Routing.PerfCounter.LogEventProcessingLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, status, averageLatencyInMs, out error)) { Log.LogError((int)EventIds.CounterFailure, "[LogEventProcessingLatencyCounterFailed] {0}", error); } double messageE2EProcessingLatencyTotalMSecs = messages.Select(m => m.EnqueuedTime).Aggregate(0D, (span, time) => span + (fsm.systemTime.UtcNow - time).TotalMilliseconds); long averageE2ELatencyInMs = messageE2EProcessingLatencyTotalMSecs < 0 ? 0L : (long)(messageE2EProcessingLatencyTotalMSecs / messages.Count); if (!Routing.PerfCounter.LogE2EEventProcessingLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, status, averageE2ELatencyInMs, out error)) { Log.LogError((int)EventIds.CounterFailure, "[LogE2EEventProcessingLatencyCounterFailed] {0}", error); } }
public static void SendFailure(EndpointExecutorFsm fsm, ISinkResult <IMessage> result, Stopwatch stopwatch) { long latencyInMs = stopwatch.ElapsedMilliseconds; if (!Routing.PerfCounter.LogExternalWriteLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, false, latencyInMs, out string error)) { Log.LogError((int)EventIds.CounterFailure, "[LogExternalWriteLatencyCounterFailed] {0}", error); } SendFailureDetails failureDetails = result.SendFailureDetails.GetOrElse(DefaultSendFailureDetails); foreach (InvalidDetails <IMessage> invalidDetails in result.InvalidDetailsList) { Routing.UserAnalyticsLogger.LogInvalidMessage(fsm.Endpoint.IotHubName, invalidDetails.Item, invalidDetails.FailureKind); } Log.LogWarning( (int)EventIds.SendFailure, failureDetails.RawException, "[SendFailure] Sending failed. SuccessfulSize: {0}, FailedSize: {1}, InvalidSize: {2}, {3}", result.Succeeded.Count, result.Failed.Count, result.InvalidDetailsList.Count, GetContextString(fsm)); LogUnhealthyEndpointOpMonError(fsm, failureDetails.FailureKind); }
public static void DeadSuccess(EndpointExecutorFsm fsm, ICollection <IMessage> messages) { Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue); CultureInfo culture = CultureInfo.InvariantCulture; DateTime reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod); Log.LogWarning( (int)EventIds.DeadSuccess, "[DeadSuccess] Dropped {0} messages. BatchSize: {1}, LastFailedRevivalTime: {2}, UnhealthySince: {3}, ReviveAt: {4}, {5}", messages.Count, messages.Count, fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture), fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture), reviveAt.ToString(DateTimeFormat, culture), GetContextString(fsm)); SetProcessingInternalCounters(fsm, "Dropped", messages); SetDroppedEgressUserMetricCounter(fsm, messages); FailureKind failureKind = fsm.currentCheckpointCommand?.Result?.SendFailureDetails.GetOrElse(DefaultFailureDetails).FailureKind ?? FailureKind.InternalError; foreach (IMessage message in messages) { Routing.UserAnalyticsLogger.LogDroppedMessage(fsm.Endpoint.IotHubName, message, fsm.Endpoint.Name, failureKind); } }
static Task ExitFailingAsync(EndpointExecutorFsm thisPtr) { thisPtr.retryTimer.Change(Timeout.InfiniteTimeSpan, Timeout.InfiniteTimeSpan); thisPtr.retryPeriod = Timeout.InfiniteTimeSpan; Events.Retry(thisPtr); return(TaskEx.Done); }
static void LogUnhealthyEndpointOpMonError(EndpointExecutorFsm fsm, FailureKind failureKind) { if (!fsm.lastFailedRevivalTime.HasValue && fsm.unhealthySince.GetOrElse(DateTime.MaxValue) < fsm.systemTime.UtcNow.Subtract(LogUserAnalyticsErrorOnUnhealthySince)) { Routing.UserAnalyticsLogger.LogUnhealthyEndpoint(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, failureKind); } }
static void SetDroppedEgressUserMetricCounter(EndpointExecutorFsm fsm, IEnumerable <IMessage> messages) { foreach (IGrouping <Type, IMessage> group in messages.GroupBy(m => m.MessageSource.GetType()).Where(g => g.Any())) { int count = group.Count(); Routing.UserMetricLogger.LogEgressMetric(count, fsm.Endpoint.IotHubName, MessageRoutingStatus.Dropped, count > 0 ? group.First().ToString() : group.Key.Name); } }
public static void Dead(EndpointExecutorFsm fsm, ICollection <IMessage> messages) { Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue); DateTime reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod); Log.LogWarning((int)EventIds.Dead, "[Dead] Dropping {0} messages. BatchSize: {1}, LastFailedRevivalTime: {2}, UnhealthySince: {3}, ReviveAt: {4}, {5}", messages.Count, messages.Count, fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat), fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat), reviveAt.ToString(DateTimeFormat), GetContextString(fsm)); }
public static void CheckpointFailure(EndpointExecutorFsm fsm, Exception ex) { Log.LogError( (int)EventIds.CheckpointFailure, ex, "[CheckpointFailure] Checkpointing failed. CheckpointOffset: {0}, {1}", fsm.Status.CheckpointerStatus.Offset, GetContextString(fsm)); }
static Task PrepareForReviveAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is Revive); Events.PrepareForRevive(thisPtr); return(TaskEx.Done); }
static Task ExitDeadCheckpointingAsync(EndpointExecutorFsm thisPtr) { Preconditions.CheckNotNull(thisPtr); thisPtr.currentSendCommand.Complete(); thisPtr.currentSendCommand = null; thisPtr.currentCheckpointCommand = null; return(TaskEx.Done); }
public static void Checkpoint(EndpointExecutorFsm fsm, ISinkResult <IMessage> result) { Log.LogDebug( (int)EventIds.Checkpoint, "[Checkpoint] Checkpointing began. CheckpointOffset: {0}, SuccessfulSize: {1}, RemainingSize: {2}, {3}", fsm.Status.CheckpointerStatus.Offset, result.Succeeded.Count + result.InvalidDetailsList.Count, result.Failed.Count, GetContextString(fsm)); }
public static void Send(EndpointExecutorFsm fsm, ICollection <IMessage> messages, ICollection <IMessage> admitted) { Log.LogDebug( (int)EventIds.Send, "[Send Sending began. BatchSize: {0}, AdmittedSize: {1}, MaxAdmittedOffset: {2}, {3}", messages.Count, admitted.Count, admitted.Max(m => m.Offset), GetContextString(fsm)); }
static string GetContextString(EndpointExecutorFsm fsm) { return(string.Format( CultureInfo.InvariantCulture, "EndpointId: {0}, EndpointName: {1}, CheckpointerId: {2}, State: {3}", fsm.Status.Id, fsm.Endpoint.Name, fsm.Status.CheckpointerStatus.Id, fsm.state)); }
static Task DieAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is Die); thisPtr.lastFailedRevivalTime = Option.Some(thisPtr.systemTime.UtcNow); Events.Die(thisPtr); return(TaskEx.Done); }
static Task PrepareForCheckpointAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is Checkpoint); var checkpoint = (Checkpoint)command; thisPtr.currentCheckpointCommand = checkpoint; return(TaskEx.Done); }
static Task SucceedCompleteAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is Succeed); thisPtr.currentSendCommand.Complete(); thisPtr.currentSendCommand = null; thisPtr.currentCheckpointCommand = null; return(TaskEx.Done); }
static Task SendMessageClosedAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is SendMessage); var send = (SendMessage)command; send.Complete(); return(TaskEx.Done); }
static Task FailAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is Fail); var fail = (Fail)command; thisPtr.retryPeriod = fail.RetryAfter; return(TaskEx.Done); }
public static void PrepareForRevive(EndpointExecutorFsm fsm) { Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue); CultureInfo culture = CultureInfo.InvariantCulture; DateTime reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod); Log.LogInformation((int)EventIds.PrepareForRevive, "[PrepareForRevive] Attempting to bring endpoint back. LastFailedRevivalTime: {0}, UnhealthySince: {1}, ReviveAt: {2}, {3}", fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture), fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture), reviveAt.ToString(DateTimeFormat, culture), GetContextString(fsm)); }
static Task PrepareForSendAsync(EndpointExecutorFsm thisPtr, ICommand command) { Preconditions.CheckNotNull(thisPtr); Preconditions.CheckNotNull(command); Preconditions.CheckArgument(command is SendMessage); var send = (SendMessage)command; thisPtr.currentSendCommand = send; return(TaskEx.Done); }
static void SetInvalidEgressUserMetricCounter(EndpointExecutorFsm fsm, IEnumerable <IMessage> messages) { foreach (IGrouping <Type, IMessage> group in messages.GroupBy(m => m.MessageSource.GetType()).Where(g => g.Any())) { int count = group.Count(); if (count > 0) { Routing.UserMetricLogger.LogEgressMetric(count, fsm.Endpoint.IotHubName, MessageRoutingStatus.Invalid, group.First()); } } }
public static void DeadFailure(EndpointExecutorFsm fsm, Exception ex) { Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue); CultureInfo culture = CultureInfo.InvariantCulture; DateTime reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod); Log.LogError((int)EventIds.DeadFailure, ex, "[DeadFailure] Dropping messages failed. LastFailedRevivalTime: {0}, UnhealthySince: {1}, DeadTime:{2}, ReviveAt: {3}, {4}", fsm.Status.LastFailedRevivalTime.ToString(), fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture), fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture), reviveAt.ToString(DateTimeFormat, culture), GetContextString(fsm)); }
public static void SendFailureUnhandledException(EndpointExecutorFsm fsm, ICollection <IMessage> messages, Stopwatch stopwatch, Exception exception) { long latencyInMs = stopwatch.ElapsedMilliseconds; if (!Routing.PerfCounter.LogExternalWriteLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, false, latencyInMs, out string error)) { Log.LogError((int)EventIds.CounterFailure, "[LogExternalWriteLatencyCounterFailed] {0}", error); } Log.LogError((int)EventIds.SendFailureUnhandledException, exception, "[SendFailureUnhandledException] Unhandled exception. FailedSize: {0}, {1}", messages.Count, GetContextString(fsm)); LogUnhealthyEndpointOpMonError(fsm, FailureKind.InternalError); }