Example #1
0
        static ICommand EnterCheckpointingHelper(EndpointExecutorFsm thisPtr)
        {
            ICommand next;

            // If there was a partial failure, try to resend the failed messages
            // Copy the initial send message command to keep the uncompleted TaskCompletionSource
            if (thisPtr.currentCheckpointCommand.Result.Failed.Count > 0)
            {
                TimeSpan retryAfter;
                // PartialFailures should always have an exception object filled out
                Exception innerException = thisPtr.currentCheckpointCommand.Result.SendFailureDetails.GetOrElse(DefaultSendFailureDetails).RawException;

                // 1. Change Messages to be sent to failedMessages
                // 2. Change the Command type to Fail or Dead depending on retry
                thisPtr.currentSendCommand = thisPtr.currentSendCommand.Copy(thisPtr.currentCheckpointCommand.Result.Failed);

                bool shouldRetry = thisPtr.ShouldRetry(innerException, out retryAfter);
                Events.CheckRetryInnerException(innerException, shouldRetry);
                if (shouldRetry)
                {
                    next = Commands.Fail(retryAfter);
                }
                else
                {
                    next = thisPtr.config.ThrowOnDead ? (ICommand)Commands.Throw(innerException) : Commands.Die;
                }
            }
            else
            {
                next = Commands.Succeed;
            }

            return(next);
        }
Example #2
0
            public static void RetryDelay(EndpointExecutorFsm fsm)
            {
                DateTime next = fsm.systemTime.UtcNow.SafeAdd(fsm.Status.RetryPeriod);

                Log.LogDebug((int)EventIds.RetryDelay, "[RetryDelay] Waiting to retry. Retry.Attempts: {0}, Retry.Period: {1}, Retry.Next: {2}, {3}",
                             fsm.Status.RetryAttempts, fsm.Status.RetryPeriod.ToString(TimeSpanFormat), next.ToString(DateTimeFormat), GetContextString(fsm));
            }
Example #3
0
        static async Task EnterDeadCheckpointingAsync(EndpointExecutorFsm thisPtr)
        {
            ICommand next;

            try
            {
                Preconditions.CheckNotNull(thisPtr.currentCheckpointCommand);
                using (var cts = new CancellationTokenSource(thisPtr.config.Timeout))
                {
                    ISinkResult <IMessage> result = thisPtr.currentCheckpointCommand.Result;
                    Events.Checkpoint(thisPtr, result);
                    await thisPtr.Checkpointer.CommitAsync(result.Succeeded, EmptyMessages, thisPtr.lastFailedRevivalTime, thisPtr.unhealthySince, cts.Token);

                    Events.CheckpointSuccess(thisPtr, result);
                }

                next = Commands.DeadSucceed;
                Events.DeadSuccess(thisPtr, thisPtr.currentCheckpointCommand.Result.Succeeded);
            }
            catch (Exception ex)
            {
                Events.CheckpointFailure(thisPtr, ex);
                next = thisPtr.config.ThrowOnDead
                    ? (ICommand)Commands.Throw(ex)
                    : Commands.DeadSucceed;
            }

            await RunInternalAsync(thisPtr, next);
        }
Example #4
0
 static Task EnterFailingAsync(EndpointExecutorFsm thisPtr)
 {
     thisPtr.retryAttempts++;
     thisPtr.retryTimer.Change(thisPtr.retryPeriod, Timeout.InfiniteTimeSpan);
     Events.RetryDelay(thisPtr);
     return(TaskEx.Done);
 }
Example #5
0
        static async Task EnterProcessDeadAsync(EndpointExecutorFsm thisPtr)
        {
            Preconditions.CheckArgument(thisPtr.lastFailedRevivalTime.HasValue);

            TimeSpan deadFor = thisPtr.systemTime.UtcNow - thisPtr.lastFailedRevivalTime.GetOrElse(thisPtr.systemTime.UtcNow);

            if (deadFor >= thisPtr.config.RevivePeriod)
            {
                await RunInternalAsync(thisPtr, Commands.Revive);
            }
            else
            {
                try
                {
                    // In the dead state, checkpoint all "sent" messages
                    // This effectively drops the messages, allowing the other endpoints in
                    // this router to make progress.
                    ICollection <IMessage> tocheckpoint = thisPtr.currentSendCommand.Messages;
                    Events.Dead(thisPtr, tocheckpoint);
                    SendFailureDetails persistingFailureDetails = thisPtr.currentCheckpointCommand?.Result?.SendFailureDetails.GetOrElse(default(SendFailureDetails));
                    await RunInternalAsync(thisPtr, Commands.Checkpoint(new SinkResult <IMessage>(tocheckpoint, persistingFailureDetails)));
                }
                catch (Exception ex)
                {
                    Events.DeadFailure(thisPtr, ex);
                }
            }
        }
Example #6
0
        static async Task RunInternalAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            StateTransition transition;
            var             pair = new StateCommandPair(thisPtr.state, command.Type);

            if (!Transitions.TryGetValue(pair, out transition))
            {
                throw new InvalidOperationException($"Unknown state transition. In state.{thisPtr.state}, Got command.{command.Type}");
            }

            StateActions currentActions = Actions[thisPtr.state];
            StateActions nextActions    = Actions[transition.NextState];

            await currentActions.Exit(thisPtr);

            Events.StateExit(thisPtr);

            await transition.TransitionAction(thisPtr, command);

            Events.StateTransition(thisPtr, thisPtr.state, transition.NextState);

            thisPtr.state = transition.NextState;
            await nextActions.Enter(thisPtr);

            Events.StateEnter(thisPtr);
        }
Example #7
0
        static async Task UpdateEndpointAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is UpdateEndpoint);
            thisPtr.retryAttempts = 0;

            Events.UpdateEndpoint(thisPtr);

            try
            {
                var update = (UpdateEndpoint)command;
                await thisPtr.processor.CloseAsync(CancellationToken.None);

                thisPtr.processor = update.Endpoint.CreateProcessor();
                Events.UpdateEndpointSuccess(thisPtr);
            }
            catch (Exception ex)
            {
                Events.UpdateEndpointFailure(thisPtr, ex);

                // TODO(manusr): If this throws, it will break the state machine
                throw;
            }
        }
Example #8
0
            static void SetProcessingInternalCounters(EndpointExecutorFsm fsm, string status, ICollection <IMessage> messages)
            {
                if (!messages.Any())
                {
                    return;
                }

                if (!Routing.PerfCounter.LogEventsProcessed(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, status, messages.Count, out string error))
                {
                    Log.LogError((int)EventIds.CounterFailure, "[LogEventsProcessedCounterFailed] {0}", error);
                }

                double totalTimeMSecs     = messages.Select(m => m.DequeuedTime).Aggregate(0D, (span, time) => span + (fsm.systemTime.UtcNow - time).TotalMilliseconds);
                long   averageLatencyInMs = totalTimeMSecs < 0 ? 0L : (long)(totalTimeMSecs / messages.Count);

                if (!Routing.PerfCounter.LogEventProcessingLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, status, averageLatencyInMs, out error))
                {
                    Log.LogError((int)EventIds.CounterFailure, "[LogEventProcessingLatencyCounterFailed] {0}", error);
                }

                double messageE2EProcessingLatencyTotalMSecs = messages.Select(m => m.EnqueuedTime).Aggregate(0D, (span, time) => span + (fsm.systemTime.UtcNow - time).TotalMilliseconds);
                long   averageE2ELatencyInMs = messageE2EProcessingLatencyTotalMSecs < 0 ? 0L : (long)(messageE2EProcessingLatencyTotalMSecs / messages.Count);

                if (!Routing.PerfCounter.LogE2EEventProcessingLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, status, averageE2ELatencyInMs, out error))
                {
                    Log.LogError((int)EventIds.CounterFailure, "[LogE2EEventProcessingLatencyCounterFailed] {0}", error);
                }
            }
Example #9
0
            public static void SendFailure(EndpointExecutorFsm fsm, ISinkResult <IMessage> result, Stopwatch stopwatch)
            {
                long latencyInMs = stopwatch.ElapsedMilliseconds;

                if (!Routing.PerfCounter.LogExternalWriteLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, false, latencyInMs, out string error))
                {
                    Log.LogError((int)EventIds.CounterFailure, "[LogExternalWriteLatencyCounterFailed] {0}", error);
                }

                SendFailureDetails failureDetails = result.SendFailureDetails.GetOrElse(DefaultSendFailureDetails);

                foreach (InvalidDetails <IMessage> invalidDetails in result.InvalidDetailsList)
                {
                    Routing.UserAnalyticsLogger.LogInvalidMessage(fsm.Endpoint.IotHubName, invalidDetails.Item, invalidDetails.FailureKind);
                }

                Log.LogWarning(
                    (int)EventIds.SendFailure,
                    failureDetails.RawException,
                    "[SendFailure] Sending failed. SuccessfulSize: {0}, FailedSize: {1}, InvalidSize: {2}, {3}",
                    result.Succeeded.Count,
                    result.Failed.Count,
                    result.InvalidDetailsList.Count,
                    GetContextString(fsm));

                LogUnhealthyEndpointOpMonError(fsm, failureDetails.FailureKind);
            }
Example #10
0
            public static void DeadSuccess(EndpointExecutorFsm fsm, ICollection <IMessage> messages)
            {
                Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue);

                CultureInfo culture  = CultureInfo.InvariantCulture;
                DateTime    reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod);

                Log.LogWarning(
                    (int)EventIds.DeadSuccess,
                    "[DeadSuccess] Dropped {0} messages. BatchSize: {1}, LastFailedRevivalTime: {2}, UnhealthySince: {3}, ReviveAt: {4}, {5}",
                    messages.Count,
                    messages.Count,
                    fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture),
                    fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture),
                    reviveAt.ToString(DateTimeFormat, culture),
                    GetContextString(fsm));

                SetProcessingInternalCounters(fsm, "Dropped", messages);
                SetDroppedEgressUserMetricCounter(fsm, messages);

                FailureKind failureKind = fsm.currentCheckpointCommand?.Result?.SendFailureDetails.GetOrElse(DefaultFailureDetails).FailureKind ?? FailureKind.InternalError;

                foreach (IMessage message in messages)
                {
                    Routing.UserAnalyticsLogger.LogDroppedMessage(fsm.Endpoint.IotHubName, message, fsm.Endpoint.Name, failureKind);
                }
            }
Example #11
0
 static Task ExitFailingAsync(EndpointExecutorFsm thisPtr)
 {
     thisPtr.retryTimer.Change(Timeout.InfiniteTimeSpan, Timeout.InfiniteTimeSpan);
     thisPtr.retryPeriod = Timeout.InfiniteTimeSpan;
     Events.Retry(thisPtr);
     return(TaskEx.Done);
 }
Example #12
0
 static void LogUnhealthyEndpointOpMonError(EndpointExecutorFsm fsm, FailureKind failureKind)
 {
     if (!fsm.lastFailedRevivalTime.HasValue &&
         fsm.unhealthySince.GetOrElse(DateTime.MaxValue) < fsm.systemTime.UtcNow.Subtract(LogUserAnalyticsErrorOnUnhealthySince))
     {
         Routing.UserAnalyticsLogger.LogUnhealthyEndpoint(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, failureKind);
     }
 }
Example #13
0
 static void SetDroppedEgressUserMetricCounter(EndpointExecutorFsm fsm, IEnumerable <IMessage> messages)
 {
     foreach (IGrouping <Type, IMessage> group in messages.GroupBy(m => m.MessageSource.GetType()).Where(g => g.Any()))
     {
         int count = group.Count();
         Routing.UserMetricLogger.LogEgressMetric(count, fsm.Endpoint.IotHubName, MessageRoutingStatus.Dropped, count > 0 ? group.First().ToString() : group.Key.Name);
     }
 }
Example #14
0
            public static void Dead(EndpointExecutorFsm fsm, ICollection <IMessage> messages)
            {
                Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue);
                DateTime reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod);

                Log.LogWarning((int)EventIds.Dead, "[Dead] Dropping {0} messages. BatchSize: {1}, LastFailedRevivalTime: {2}, UnhealthySince: {3}, ReviveAt: {4}, {5}",
                               messages.Count, messages.Count, fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat),
                               fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat), reviveAt.ToString(DateTimeFormat), GetContextString(fsm));
            }
Example #15
0
 public static void CheckpointFailure(EndpointExecutorFsm fsm, Exception ex)
 {
     Log.LogError(
         (int)EventIds.CheckpointFailure,
         ex,
         "[CheckpointFailure] Checkpointing failed. CheckpointOffset: {0}, {1}",
         fsm.Status.CheckpointerStatus.Offset,
         GetContextString(fsm));
 }
Example #16
0
        static Task PrepareForReviveAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is Revive);

            Events.PrepareForRevive(thisPtr);
            return(TaskEx.Done);
        }
Example #17
0
        static Task ExitDeadCheckpointingAsync(EndpointExecutorFsm thisPtr)
        {
            Preconditions.CheckNotNull(thisPtr);

            thisPtr.currentSendCommand.Complete();
            thisPtr.currentSendCommand       = null;
            thisPtr.currentCheckpointCommand = null;
            return(TaskEx.Done);
        }
Example #18
0
 public static void Checkpoint(EndpointExecutorFsm fsm, ISinkResult <IMessage> result)
 {
     Log.LogDebug(
         (int)EventIds.Checkpoint,
         "[Checkpoint] Checkpointing began. CheckpointOffset: {0}, SuccessfulSize: {1}, RemainingSize: {2}, {3}",
         fsm.Status.CheckpointerStatus.Offset,
         result.Succeeded.Count + result.InvalidDetailsList.Count,
         result.Failed.Count,
         GetContextString(fsm));
 }
Example #19
0
 public static void Send(EndpointExecutorFsm fsm, ICollection <IMessage> messages, ICollection <IMessage> admitted)
 {
     Log.LogDebug(
         (int)EventIds.Send,
         "[Send Sending began. BatchSize: {0}, AdmittedSize: {1}, MaxAdmittedOffset: {2}, {3}",
         messages.Count,
         admitted.Count,
         admitted.Max(m => m.Offset),
         GetContextString(fsm));
 }
Example #20
0
 static string GetContextString(EndpointExecutorFsm fsm)
 {
     return(string.Format(
                CultureInfo.InvariantCulture,
                "EndpointId: {0}, EndpointName: {1}, CheckpointerId: {2}, State: {3}",
                fsm.Status.Id,
                fsm.Endpoint.Name,
                fsm.Status.CheckpointerStatus.Id,
                fsm.state));
 }
Example #21
0
        static Task DieAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is Die);

            thisPtr.lastFailedRevivalTime = Option.Some(thisPtr.systemTime.UtcNow);
            Events.Die(thisPtr);
            return(TaskEx.Done);
        }
Example #22
0
        static Task PrepareForCheckpointAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is Checkpoint);

            var checkpoint = (Checkpoint)command;

            thisPtr.currentCheckpointCommand = checkpoint;
            return(TaskEx.Done);
        }
Example #23
0
        static Task SucceedCompleteAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is Succeed);

            thisPtr.currentSendCommand.Complete();
            thisPtr.currentSendCommand       = null;
            thisPtr.currentCheckpointCommand = null;
            return(TaskEx.Done);
        }
Example #24
0
        static Task SendMessageClosedAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is SendMessage);

            var send = (SendMessage)command;

            send.Complete();
            return(TaskEx.Done);
        }
Example #25
0
        static Task FailAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is Fail);

            var fail = (Fail)command;

            thisPtr.retryPeriod = fail.RetryAfter;
            return(TaskEx.Done);
        }
Example #26
0
            public static void PrepareForRevive(EndpointExecutorFsm fsm)
            {
                Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue);
                CultureInfo culture  = CultureInfo.InvariantCulture;
                DateTime    reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod);

                Log.LogInformation((int)EventIds.PrepareForRevive, "[PrepareForRevive] Attempting to bring endpoint back. LastFailedRevivalTime: {0}, UnhealthySince: {1},  ReviveAt: {2}, {3}",
                                   fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture),
                                   fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture),
                                   reviveAt.ToString(DateTimeFormat, culture), GetContextString(fsm));
            }
Example #27
0
        static Task PrepareForSendAsync(EndpointExecutorFsm thisPtr, ICommand command)
        {
            Preconditions.CheckNotNull(thisPtr);
            Preconditions.CheckNotNull(command);
            Preconditions.CheckArgument(command is SendMessage);

            var send = (SendMessage)command;

            thisPtr.currentSendCommand = send;
            return(TaskEx.Done);
        }
Example #28
0
 static void SetInvalidEgressUserMetricCounter(EndpointExecutorFsm fsm, IEnumerable <IMessage> messages)
 {
     foreach (IGrouping <Type, IMessage> group in messages.GroupBy(m => m.MessageSource.GetType()).Where(g => g.Any()))
     {
         int count = group.Count();
         if (count > 0)
         {
             Routing.UserMetricLogger.LogEgressMetric(count, fsm.Endpoint.IotHubName, MessageRoutingStatus.Invalid, group.First());
         }
     }
 }
Example #29
0
            public static void DeadFailure(EndpointExecutorFsm fsm, Exception ex)
            {
                Preconditions.CheckArgument(fsm.Status.LastFailedRevivalTime.HasValue);

                CultureInfo culture  = CultureInfo.InvariantCulture;
                DateTime    reviveAt = fsm.Status.LastFailedRevivalTime.GetOrElse(fsm.systemTime.UtcNow).SafeAdd(fsm.config.RevivePeriod);

                Log.LogError((int)EventIds.DeadFailure, ex, "[DeadFailure] Dropping messages failed. LastFailedRevivalTime: {0}, UnhealthySince: {1}, DeadTime:{2}, ReviveAt: {3}, {4}",
                             fsm.Status.LastFailedRevivalTime.ToString(), fsm.Status.LastFailedRevivalTime.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture),
                             fsm.Status.UnhealthySince.GetOrElse(Checkpointers.Checkpointer.DateTimeMinValue).ToString(DateTimeFormat, culture), reviveAt.ToString(DateTimeFormat, culture),
                             GetContextString(fsm));
            }
Example #30
0
            public static void SendFailureUnhandledException(EndpointExecutorFsm fsm, ICollection <IMessage> messages, Stopwatch stopwatch, Exception exception)
            {
                long latencyInMs = stopwatch.ElapsedMilliseconds;

                if (!Routing.PerfCounter.LogExternalWriteLatency(fsm.Endpoint.IotHubName, fsm.Endpoint.Name, fsm.Endpoint.Type, false, latencyInMs, out string error))
                {
                    Log.LogError((int)EventIds.CounterFailure, "[LogExternalWriteLatencyCounterFailed] {0}", error);
                }

                Log.LogError((int)EventIds.SendFailureUnhandledException, exception, "[SendFailureUnhandledException] Unhandled exception.  FailedSize: {0}, {1}", messages.Count, GetContextString(fsm));
                LogUnhealthyEndpointOpMonError(fsm, FailureKind.InternalError);
            }