protected override async Task ExecuteActionAsync(FabricTestContext testContext, ValidateApplicationServicesAction action, CancellationToken cancellationToken)
            {
                ThrowIf.Null(action.ApplicationName, "ApplicationName");

                // One stabilization budget is shared by the service query and by every per-service validation.
                var stabilizationBudget = new TimeoutHelper(action.MaximumStabilizationTimeout);

                // TODO: make these actions which store state locally as well.
                var servicesOfApplication = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetServiceListAsync(
                        action.ApplicationName,
                        null,
                        action.RequestTimeout,
                        cancellationToken),
                    stabilizationBudget.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                var pendingValidations = new List<Task>();

                // Start one validation per service; they run concurrently and are awaited together below.
                foreach (Service service in servicesOfApplication)
                {
                    var perServiceValidation = new ValidateServiceAction(service.ServiceName, stabilizationBudget.GetRemainingTime());
                    perServiceValidation.ActionTimeout  = action.ActionTimeout;
                    perServiceValidation.RequestTimeout = action.RequestTimeout;
                    perServiceValidation.CheckFlag      = action.CheckFlag;

                    Task validation = testContext.ActionExecutor.RunAsync(perServiceValidation, cancellationToken);
                    pendingValidations.Add(validation);
                }

                await Task.WhenAll(pendingValidations).ConfigureAwait(false);

                this.ResultTraceString = StringHelper.Format("ValidateApplicationServicesAction succeeded for {0}", action.ApplicationName);
            }
Beispiel #2
0
        /// <summary>
        /// Finds the node that hosts an active secondary replica of the partition with the fixed id
        /// 00000000-0000-0000-0000-000000005000 (presumably the Fault Analysis Service partition, per the method name).
        /// </summary>
        /// <returns>
        /// The node whose name matches the active secondary replica's node name, or null when the node list
        /// contains no such node.
        /// </returns>
        /// <exception cref="InvalidOperationException">
        /// The replica list could not be retrieved within two minutes, or it contains no active secondary replica.
        /// </exception>
        internal static async Task <Node> GetNodeWithFASSecondary()
        {
            // Await instead of blocking on .Result: blocking inside an async method ties up the calling thread
            // and can deadlock when a synchronization context is present.
            NodeList           nodeList = await ActionTest.GetNodeListAsync();
            ServiceReplicaList list     = null;

            System.Fabric.Common.TimeoutHelper timeoutHelper = new System.Fabric.Common.TimeoutHelper(TimeSpan.FromMinutes(2));

            // FabricClient is disposable; release it as soon as the query loop is done.
            using (FabricClient fc = new FabricClient())
            {
                do
                {
                    bool queryFailed = false;

                    try
                    {
                        list = await fc.QueryManager.GetReplicaListAsync(new Guid("00000000-0000-0000-0000-000000005000"));
                    }
                    catch (Exception)
                    {
                        // Best-effort retry: any failure is retried until the two minute budget is exhausted.
                        queryFailed = true;
                    }

                    if (queryFailed)
                    {
                        // The delay is awaited outside the catch block so the code also compiles with compilers
                        // that do not allow await inside catch.
                        await Task.Delay(TimeSpan.FromSeconds(1));
                    }
                }
                while (list == null && timeoutHelper.GetRemainingTime() > TimeSpan.Zero);
            }

            if (list == null)
            {
                throw new InvalidOperationException("Could not resolve FAS primary");
            }

            Replica replica = list.FirstOrDefault(r => ((StatefulServiceReplica)r).ReplicaRole == ReplicaRole.ActiveSecondary);

            if (replica == null)
            {
                // Previously this surfaced as a NullReferenceException inside the node lookup lambda.
                throw new InvalidOperationException("Could not find an active secondary replica for FAS");
            }

            return nodeList.FirstOrDefault(n => n.NodeName == replica.NodeName);
        }
Beispiel #3
0
        // Polls the action store until it reports exactly targetCount actions.
        // Polling happens every 5 seconds for up to 3 minutes; if the target is never observed the
        // mismatch is traced as an error and the process is fail-fasted.
        private void WaitForActionCount(long targetCount)
        {
            var  pollingBudget = new System.Fabric.Common.TimeoutHelper(TimeSpan.FromMinutes(3));
            long observedCount;

            while (true)
            {
                observedCount = this.actionStore.GetActionCountAsync(false).GetAwaiter().GetResult();
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "Current action count='{0}', target action count='{1}'", observedCount, targetCount);

                if (observedCount == targetCount)
                {
                    // Target reached - nothing left to verify.
                    return;
                }

                Task.Delay(TimeSpan.FromSeconds(5)).Wait();

                if (pollingBudget.GetRemainingTime() <= TimeSpan.Zero)
                {
                    break;
                }
            }

            // Only reachable when the budget ran out without observing the target count.
            string error = string.Format(CultureInfo.InvariantCulture, "Did not reach expected target action count='{0}', current action count='{1}'", targetCount, observedCount);
            TestabilityTrace.TraceSource.WriteError(TraceType, error);
            System.Fabric.Common.ReleaseAssert.Failfast(error);
        }
            // Validates the system services first and then the services of every application returned by the
            // application list query. Throws exception if validation was unsuccessful.
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, ValidateAllServicesAction action, CancellationToken token)
            {
                // All steps below draw from the same stabilization budget.
                var stabilizationBudget = new TimeoutHelper(action.MaximumStabilizationTimeout);

                //// Validate system services first.
                var systemServicesValidation = new ValidateSystemServicesAction(stabilizationBudget.GetRemainingTime());
                systemServicesValidation.ActionTimeout  = action.ActionTimeout;
                systemServicesValidation.RequestTimeout = action.RequestTimeout;
                systemServicesValidation.CheckFlag      = action.CheckFlag;

                await testContext.ActionExecutor.RunAsync(systemServicesValidation, token).ConfigureAwait(false);

                var applications = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetApplicationListAsync(
                        null,
                        string.Empty,
                        action.RequestTimeout,
                        token),
                    stabilizationBudget.GetRemainingTime(),
                    token).ConfigureAwait(false);

                var applicationValidations = new List<Task>();

                // Start one validation per application; all of them are awaited together below.
                foreach (Application application in applications)
                {
                    var applicationValidation = new ValidateApplicationServicesAction(application.ApplicationName, stabilizationBudget.GetRemainingTime());
                    applicationValidation.ActionTimeout  = action.ActionTimeout;
                    applicationValidation.RequestTimeout = action.RequestTimeout;
                    applicationValidation.CheckFlag      = action.CheckFlag;

                    applicationValidations.Add(testContext.ActionExecutor.RunAsync(applicationValidation, token));
                    this.ActionTraceSource.WriteNoise(TraceType, "ValidateAllServicesActionHandler: Validation task added for application: {0}", application.ApplicationName.OriginalString);
                }

                await Task.WhenAll(applicationValidations).ConfigureAwait(false);

                this.ResultTraceString = "ValidateAllServicesActionHandler completed for all services";
            }
Beispiel #5
0
        // This doesn't run in automation, but it is being kept here so it can be run as a small test.
        // See FaultAnalysisServiceTruncate.test for a test on this code path.
        //
        // Starts six commands, waits for the stored action count to reach FASConstants.TestMaxStoredActionCountValue,
        // and then polls the completed PartitionRestart commands until exactly that many remain.
        // Fail-fasts the process when the expectation is not met within the polling budget.
        private void TestTruncate()
        {
            this.StartActionIfItHasNotBeenStarted(Command.StuckAction);
            this.StartActionIfItHasNotBeenStarted(Command.FailoverManagerDataLoss);
            this.StartActionIfItHasNotBeenStarted(Command.InvokeDataLossMidActionTestFatal);
            this.StartActionIfItHasNotBeenStarted(Command.InvokeDataLossMidActionTestTransient);

            this.StartActionIfItHasNotBeenStarted(Command.RestartPartitionMidActionTestFatal);
            this.StartActionIfItHasNotBeenStarted(Command.RestartPartitionMidActionTestTransient);

            this.WaitForActionCount(FASConstants.TestMaxStoredActionCountValue);

            // Confirm this action is still stuck - ie that an action not in terminal state is not removed
            this.mockClient.WaitForState(MockClient.MockClientCommandInfo[Command.StuckAction], Actions.Steps.StepStateNames.LookingUpState);
            this.mockClient.WaitForState(MockClient.MockClientCommandInfo[Command.RestartPartitionMidActionTestTransient], Actions.Steps.StepStateNames.CompletedSuccessfully);

            // At this point there should be 1 command in the actionTable, StuckAction, and somewhere between 2 (Constants.TestMaxStoredActionCountValue) and 5 (the total number of possible
            // completed commands) commands in the historyTable.  In steady state, after truncates have run, the historyTable should have 2 (Constants.TestMaxStoredActionCountValue) commands remaining,
            // and they should be the ones that completed last.  Since this test only allows 1 action at a time, this will always be the 2 that were started last -
            // the RestartPartition ones.
            bool conditionSatisfied = false;
            var  timeoutHelper      = new System.Fabric.Common.TimeoutHelper(TimeSpan.FromSeconds(3 * FASConstants.TestStoredActionCleanupIntervalInSeconds));

            do
            {
                TestCommandListDescription queryDescription = new TestCommandListDescription(Query.TestCommandStateFilter.CompletedSuccessfully, Query.TestCommandTypeFilter.PartitionRestart);
                TestCommandQueryResult     queryResult      = this.mockClient.GetTestCommandListAsync(queryDescription).GetAwaiter().GetResult();
                List <TestCommandStatus>   result           = queryResult.Items;

                if (result.Count < FASConstants.TestMaxStoredActionCountValue)
                {
                    string error = string.Format(
                        CultureInfo.InvariantCulture,
                        "Number of commands in the historyTable {0} is below TestMaxStoredActionCountValue (config 'DefaultMaxStoredActionCount')",
                        result.Count);
                    TestabilityTrace.TraceSource.WriteError(TraceType, error);
                    System.Fabric.Common.ReleaseAssert.Failfast(error);
                }

                int partitionRestartCount = result.Count(c => c.TestCommandType == TestCommandType.PartitionRestart);

                if (partitionRestartCount == FASConstants.TestMaxStoredActionCountValue)
                {
                    conditionSatisfied = true;
                }
                else
                {
                    // Trace the filtered count that was actually compared, not the size of the whole result.
                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Number of PartitionRestart results is {0}, expecting {1}, retrying", partitionRestartCount, FASConstants.TestMaxStoredActionCountValue);

                    // Pause between polls so the retry loop does not spin on the query while waiting for cleanup.
                    Task.Delay(TimeSpan.FromSeconds(1)).Wait();
                }
            }while (!conditionSatisfied && timeoutHelper.GetRemainingTime() > TimeSpan.Zero);

            // Only fail when the expected steady state was never observed; failing unconditionally would
            // terminate the process even after a successful check.
            if (!conditionSatisfied)
            {
                System.Fabric.Common.ReleaseAssert.Failfast(string.Format(CultureInfo.InvariantCulture, "Did not reach expected target action, see traces above filtered by type~ActionTest'"));
            }

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Exiting TestTruncate");
        }
            // Induces quorum loss on one partition of a stateful service with persisted state:
            //   1. validates the service kind and persistence,
            //   2. selects the partition and reads its replicas,
            //   3. adds a "StatefulServiceReopen" unreliable transport behavior on the node of each chosen replica,
            //   4. restarts those replicas, waits for action.QuorumLossDuration and asserts the partition is in quorum loss,
            //   5. removes the behaviors again (also attempted from the finally block when the try block fails early).
            // Throws InvalidOperationException for stateless or non-persisted services.
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeQuorumLossAction action, CancellationToken cancellationToken)
            {
                ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

                var helper = new TimeoutHelper(action.ActionTimeout);

                // get info about the service so we can check type and trss
                ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        action.PartitionSelector.ServiceName,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                if (result.Kind != ServiceDescriptionKind.Stateful)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
                }

                StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;

                ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service is not a stateful service");

                if (!statefulServiceDescription.HasPersistedState)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "QuorumLoss", "Stateful Persistent", action.PartitionSelector.ServiceName, "Stateful In-Memory Only"));
                }

                // figure out /which/ partition to select
                var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
                {
                    RequestTimeout = action.RequestTimeout,
                    ActionTimeout  = helper.GetRemainingTime()
                };

                await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);

                Guid partitionId = getPartitionStateAction.Result.PartitionId;

                // get data about replicas in that partition
                ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                // removeUTRequestList records every (nodeName, behaviorName) pair for which a behavior was requested;
                // removeUTTaskDictionary records the removal task once a removal has been started for that pair.
                var removeUTRequestList = new List <Tuple <string, string> >();
                Dictionary <Tuple <string, string>, Task> removeUTTaskDictionary = new Dictionary <Tuple <string, string>, Task>();

                try
                {
                    var  stableReplicas                  = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
                    var  stableReplicasToRemove          = new List <StatefulServiceReplica>();
                    long replicasToRestartWithoutPrimary =
                        action.QuorumLossMode == QuorumLossMode.AllReplicas
                            ? stableReplicas.Length - 1
                            : FabricCluster.GetWriteQuorumSize(replicasResult.Count);

                    // The counter is decremented for every non-primary replica; non-primaries are selected while it
                    // stays non-negative, and a primary is always selected.
                    foreach (var replica in stableReplicas)
                    {
                        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                        ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
                        if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
                        {
                            replicasToRestartWithoutPrimary--;
                        }

                        if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
                        {
                            stableReplicasToRemove.Add(statefulReplica);
                        }
                    }

                    // for selected replicas, block reopen so that when we restart the replica (NOT remove the replica) it doesn't come up
                    var utTaskList = new List <Task>();
                    foreach (var statefulReplica in stableReplicasToRemove)
                    {
                        string nodeName = statefulReplica.NodeName;
                        UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
                        behavior.AddFilterForPartitionId(partitionId);
                        string behaviorName = "BlockStatefulServiceReopen_" + nodeName;

                        // Recorded before the add is issued so the finally block can clean up even if the add fails midway.
                        removeUTRequestList.Add(new Tuple <string, string>(nodeName, behaviorName));
                        utTaskList.Add(
                            FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () =>
                                testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                                    nodeName,
                                    behaviorName,
                                    behavior,
                                    action.RequestTimeout,
                                    cancellationToken),
                                helper.GetRemainingTime(),
                                cancellationToken));
                    }

                    await Task.WhenAll(utTaskList).ConfigureAwait(false);

                    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                    var restartReplicaTaskList = new List <Task>();
                    foreach (var statefulReplica in stableReplicasToRemove)
                    {
                        ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(action.PartitionSelector.ServiceName, partitionId), statefulReplica.Id);

                        var restartReplicaAction = new RestartReplicaAction(replicaSelector)
                        {
                            CompletionMode = CompletionMode.DoNotVerify,
                            RequestTimeout = action.RequestTimeout,
                            ActionTimeout  = helper.GetRemainingTime()
                        };

                        restartReplicaTaskList.Add(testContext.ActionExecutor.RunAsync(restartReplicaAction, cancellationToken));
                    }

                    await Task.WhenAll(restartReplicaTaskList).ConfigureAwait(false);

                    await AsyncWaiter.WaitAsync(action.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

                    // validate
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                            action.PartitionSelector.ServiceName,
                            null,
                            action.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);

                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            ReleaseAssert.AssertIf(partition.PartitionStatus != ServicePartitionStatus.InQuorumLoss, "Partition failed to be in Quorum Loss.");
                            break;
                        }
                    }

                    foreach (var removeUTParams in removeUTRequestList)
                    {
                        var  currentParams = removeUTParams;
                        Task task          = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                currentParams.Item1,  /*nodeName*/
                                currentParams.Item2,  /*behaviorName*/
                                action.RequestTimeout,
                                cancellationToken),
                            FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                            helper.GetRemainingTime(),
                            cancellationToken);

                        removeUTTaskDictionary[currentParams] = task;
                    }

                    await Task.WhenAll(removeUTTaskDictionary.Values).ConfigureAwait(false);

                    // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);
                }
                finally
                {
                    // Walk the request list rather than the task dictionary: the dictionary is only filled at the
                    // end of the try block, so an earlier failure would otherwise leave every behavior in place.
                    // NOTE(review): a request is recorded before its add call completes, so a removal may target a
                    // behavior that was never applied - confirm RemoveUnreliableTransportBehaviorErrors tolerates that.
                    var removeUTTaskList = new List <Task>();

                    foreach (var removeUTRequest in removeUTRequestList)
                    {
                        var  currentRemoveUTRequest = removeUTRequest;
                        Task previousAttempt;
                        bool removalAlreadyHandled =
                            removeUTTaskDictionary.TryGetValue(currentRemoveUTRequest, out previousAttempt)
                            && previousAttempt != null
                            && !previousAttempt.IsFaulted;

                        if (!removalAlreadyHandled)
                        {
                            removeUTTaskList.Add(
                                FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                    () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                        currentRemoveUTRequest.Item1, /*nodeName*/
                                        currentRemoveUTRequest.Item2, /*behaviorName*/
                                        action.RequestTimeout,
                                        cancellationToken),
                                    FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                                    helper.GetRemainingTime(),
                                    cancellationToken));
                        }
                    }

                    // Blocking waits are kept here (no await inside finally). The settle delay is only needed
                    // when a removal was actually issued from this block; the success path already waited above.
                    if (removeUTTaskList.Count > 0)
                    {
                        Task.WhenAll(removeUTTaskList).Wait(cancellationToken);

                        // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                        Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
                    }
                }

                action.Result          = new InvokeQuorumLossResult(getPartitionStateAction.Result);
                this.ResultTraceString = StringHelper.Format("InvokeQuorumLossAction succeeded for {0} with QuorumLossMode = {1}", partitionId, action.QuorumLossMode);
            }
            // Restarts the ready replicas of a selected partition (or removes them when the service has no persisted
            // state) while a "DoReconfiguration" unreliable transport behavior is installed on the FM primary node.
            // The behavior is removed again afterwards; when the try block fails before the removal was attempted,
            // the finally block removes it synchronously.
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartPartitionAction action, CancellationToken cancellationToken)
            {
                ThrowIf.Null(action.PartitionSelector, "partitionSelector");

                this.helper = new TimeoutHelper(action.ActionTimeout);

                // Look up the service first so the requested mode can be checked against the service kind.
                ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        action.PartitionSelector.ServiceName,
                        action.RequestTimeout,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                bool isStateful = serviceDescription.Kind == ServiceDescriptionKind.Stateful;

                // OnlyActiveSecondaries is rejected for services that are not stateful.
                if (!isStateful && action.RestartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "RestartPartitionMode.OnlyActiveSecondaries", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
                }

                bool hasPersistedState = false;

                if (isStateful)
                {
                    var statefulDescription = serviceDescription as StatefulServiceDescription;
                    ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
                    hasPersistedState = statefulDescription.HasPersistedState;
                }

                // Resolve the selector to one concrete partition.
                var partitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector);
                partitionStateAction.RequestTimeout = action.RequestTimeout;
                partitionStateAction.ActionTimeout  = this.helper.GetRemainingTime();

                await testContext.ActionExecutor.RunAsync(partitionStateAction, cancellationToken);

                Guid partitionId = partitionStateAction.Result.PartitionId;

                // Replicas of the target partition.
                ServiceReplicaList targetReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        action.RequestTimeout,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                // Replicas of the FM partition, needed to locate the FM primary.
                ServiceReplicaList fmReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                        Constants.FmPartitionId,
                        0,
                        action.RequestTimeout,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                string fmPrimaryNodeName = string.Empty;

                // Every ready FM replica is checked; the node name of the last primary encountered is kept.
                foreach (var fmReplica in fmReplicas.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray())
                {
                    var statefulFmReplica = fmReplica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");
                    if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
                    {
                        fmPrimaryNodeName = fmReplica.NodeName;
                    }
                }

                if (string.IsNullOrEmpty(fmPrimaryNodeName))
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
                }

                ////------------------------------------------------------
                // The unreliable transport behavior is applied on the FM primary node only.
                string behaviorName = "BlockDoReconfiguration";
                var    behavior     = new UnreliableTransportBehavior("*", "DoReconfiguration");
                behavior.AddFilterForPartitionId(partitionId);

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        fmPrimaryNodeName,
                        behaviorName,
                        behavior,
                        action.RequestTimeout,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                // Set right before the removal in the try block is attempted; the finally block only removes the
                // behavior itself when that point was never reached.
                bool removalAttempted = false;

                try
                {
                    // Only replicas that are currently Ready are operated on.
                    var readyReplicas = targetReplicas.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

                    foreach (var replica in readyReplicas)
                    {
                        if (action.RestartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
                        {
                            var statefulReplica = replica as StatefulServiceReplica;
                            ReleaseAssert.AssertIf(statefulReplica == null, "Stateful service replica is not StatefulServiceReplica");
                            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
                            {
                                // The primary is left alone in this mode.
                                continue;
                            }
                        }

                        if (!hasPersistedState)
                        {
                            // No persisted state: the replica is removed instead of restarted.
                            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () => testContext.FabricClient.FaultManager.RemoveReplicaAsync(
                                    replica.NodeName,
                                    partitionId,
                                    replica.Id,
                                    CompletionMode.DoNotVerify,
                                    false, /*force remove*/
                                    action.RequestTimeout.TotalSeconds,
                                    cancellationToken),
                                this.helper.GetRemainingTime(),
                                cancellationToken).ConfigureAwait(false);
                        }
                        else
                        {
                            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () => testContext.FabricClient.FaultManager.RestartReplicaAsync(
                                    replica.NodeName,
                                    partitionId,
                                    replica.Id,
                                    CompletionMode.DoNotVerify,
                                    action.RequestTimeout.TotalSeconds,
                                    cancellationToken),
                                this.helper.GetRemainingTime(),
                                cancellationToken).ConfigureAwait(false);
                        }
                    }

                    removalAttempted = true;
                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                            fmPrimaryNodeName,
                            behaviorName,
                            action.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                        this.helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);

                    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0)).ConfigureAwait(false);
                }
                finally
                {
                    // TODO: Provide a way to clear all behaviors just in case.
                    if (!removalAttempted)
                    {
                        this.ActionTraceSource.WriteWarning(TraceType, "Exception after adding behavior to block messages. Removing behavior synchronously");

                        Task synchronousRemoval = FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                fmPrimaryNodeName,
                                behaviorName,
                                action.RequestTimeout,
                                cancellationToken),
                            FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                            this.helper.GetRemainingTime(),
                            cancellationToken);
                        synchronousRemoval.GetAwaiter().GetResult();

                        // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                        Task.Delay(TimeSpan.FromSeconds(5.0)).GetAwaiter().GetResult();
                    }
                }

                // -- note there's no explict validation

                // Publish the selected partition as the action result.
                action.Result          = new RestartPartitionResult(partitionStateAction.Result);
                this.ResultTraceString = StringHelper.Format("RestartPartitionAction succeeded for {0} with RestartPartitionMode = {1}", partitionId, action.RestartPartitionMode);
            }
Beispiel #8
0
        /// <summary>
        /// Runs the Chaos iteration loop until <c>this.cancellationToken</c> is signalled.
        /// Each pass: waits out any residual delay persisted by a previous pass, validates the
        /// cluster (and, on cluster validation failure, the services), either records a
        /// validation failure or executes a fault iteration, converts unexpected exceptions
        /// into a <c>TestErrorEvent</c>, and finally pauses for <c>waitTime</c>.
        /// </summary>
        /// <remarks>
        /// The delays are given the cancellation token, so cancellation while waiting surfaces as
        /// an exception from the awaited <c>Task.Delay</c> rather than as a normal loop exit.
        /// </remarks>
        /// <returns>A task that completes when the loop ends.</returns>
        internal async Task ExecuteIterationsWithPauseAsync()
        {
            TestabilityTrace.TraceSource.WriteNoise("StartTrek", "Enter ExecuteIterationsWithPauseAsync, datetimeutc={0}", DateTime.UtcNow);

            this.stopwatch = new Stopwatch();
            this.stopwatch.Start();

            // Holds an exception caught inside the try block so it can be processed after the catch block.
            Exception capturedException = null;

            // Current pause between passes; starts at the configured value and is doubled on failures (see below).
            TimeSpan waitTime = this.ChaosParameters.WaitTimeBetweenIterations;

            while (!this.cancellationToken.IsCancellationRequested)
            {
                // If this is not the beginning of a fresh Chaos run, before starting a new iteration,
                // consult the NextIterationTimeStampRD to find out if there is some
                // residual wait time from the previous iteration, if there is then wait that amount
                var nextIterationTimeStampUtc = await this.StateManager.GetUtcTimeStampAsync(
                    FASConstants.NextItearationTimeStampRDName,
                    FASConstants.NextItearationTimeStampKey,
                    this.partition,
                    this.cancellationToken).ConfigureAwait(false);

                var residualWaitTime = nextIterationTimeStampUtc.Subtract(DateTime.UtcNow);

                if (residualWaitTime > TimeSpan.Zero)
                {
                    // Record that we are waiting, then actually wait for the remaining time.
                    await this.StateManager.RegisterChaosEventAndUpdateChaosStatusAsync(
                        new WaitingEvent(DateTime.UtcNow, StringHelper.Format(StringResources.ChaosInfo_ResidualWaitingFromPreviousIteration, residualWaitTime)),
                        ChaosStatus.Running,
                        this.partition,
                        this.cancellationToken,
                        () =>
                    {
                        TestabilityTrace.TraceSource.WriteInfo(TraceType, "Registering WaitingEvent for waiting '{0}' left over from the previous iteration.", residualWaitTime);
                    }).ConfigureAwait(false);

                    await Task.Delay(residualWaitTime, this.cancellationToken).ConfigureAwait(false);
                }

                try
                {
                    if (await this.IsClusterReadyForFaultsAsync(this.cancellationToken).ConfigureAwait(false))
                    {
                        // One timeout budget shared by the cluster validation and the service validation below.
                        System.Fabric.Common.TimeoutHelper timer = new System.Fabric.Common.TimeoutHelper(this.ChaosParameters.MaxClusterStabilizationTimeout);

                        StringBuilder validationReport = new StringBuilder();

                        var clusterReport =
                            await
                            this.validationHelper.ValidateClusterHealthAsync(
                                timer.GetRemainingTime(),
                                this.cancellationToken).ConfigureAwait(false);

                        if (clusterReport.ValidationFailed)
                        {
                            // Doubling backoff: waitTime is doubled while it is below DefaultMaximumBackoffForChaosIterations
                            // and stops growing once it has reached or exceeded that value.
                            waitTime += (waitTime >= FASConstants.DefaultMaximumBackoffForChaosIterations) ? TimeSpan.Zero : waitTime;

                            var serviceReport =
                                await
                                this.validationHelper.ValidateAllServicesAsync(
                                    timer.GetRemainingTime(),
                                    this.cancellationToken).ConfigureAwait(false);

                            if (serviceReport.ValidationFailed)
                            {
                                // NOTE(review): this message says the cluster is healthy, yet this branch only runs when
                                // cluster validation failed - confirm whether the wording is intended.
                                TestabilityTrace.TraceSource.WriteInfo(TraceType, "Even though some services are unhealthy or unstable, going to induce faults, because the cluster is healthy.");

                                TestabilityTrace.TraceSource.WriteInfo(TraceType, "Failure reason: \n'{0}'", serviceReport.FailureReason);

                                validationReport.Append(serviceReport.FailureReason);
                            }

                            // Build the waiting notice plus the cluster failure reason and put it in front of any service failure text.
                            StringBuilder reportBuilder = new StringBuilder();
                            reportBuilder.Append(StringHelper.Format(StringResources.ChaosInfo_WaitingNotice, waitTime));
                            reportBuilder.AppendLine();
                            reportBuilder.AppendLine(clusterReport.FailureReason);

                            validationReport.Insert(0, reportBuilder.ToString());

                            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Cluster validation failed for '{0}'.", clusterReport.FailureReason);

                            var validationFailedEvent = new ValidationFailedEvent(DateTime.UtcNow, validationReport.ToString());

                            // record validation failed event
                            await this.StateManager.RegisterChaosEventAndUpdateChaosStatusAsync(
                                validationFailedEvent,
                                ChaosStatus.Running,
                                this.partition,
                                this.cancellationToken,
                                () =>
                            {
                                FabricEvents.Events.ChaosValidationFailed(
                                    Guid.NewGuid().ToString(),
                                    validationFailedEvent.TimeStampUtc.Ticks,
                                    validationFailedEvent.Reason);
                            }).ConfigureAwait(false);

                            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Pausing for '{0}' before performing next check.", waitTime);
                        }
                        else
                        {
                            // Cluster validation passed: drop any accumulated backoff.
                            waitTime = this.ChaosParameters.WaitTimeBetweenIterations;

                            await this.StateManager.RegisterCurrentStatusAsync(
                                ChaosStatus.Running,
                                this.partition,
                                this.cancellationToken).ConfigureAwait(false);

                            // Persist when the next iteration is due before running faults; this is the
                            // timestamp read back at the top of the loop to compute the residual wait.
                            var timestampOfNextIteration = DateTime.UtcNow.Add(waitTime);

                            await this.StateManager.SetUtcTimeStampAsync(
                                FASConstants.NextItearationTimeStampRDName,
                                FASConstants.NextItearationTimeStampKey,
                                timestampOfNextIteration,
                                this.partition,
                                this.cancellationToken).ConfigureAwait(false);

                            await this.ExecuteFaultIterationAsync(this.cancellationToken).ConfigureAwait(false);

                            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Pausing for '{0}' before executing next iteration.", waitTime);
                        }
                    }
                }
                catch (Exception exception)
                {
                    capturedException = exception;

                    // Doubling backoff: waitTime is doubled while it is below DefaultMaximumBackoffForChaosIterations
                    // and stops growing once it has reached or exceeded that value.
                    waitTime += (waitTime >= FASConstants.DefaultMaximumBackoffForChaosIterations) ? TimeSpan.Zero : waitTime;

                    var timestampOfNextIteration = DateTime.UtcNow.Add(waitTime);

                    // An exception thrown by this await is not caught here and leaves the method.
                    await this.StateManager.SetUtcTimeStampAsync(
                        FASConstants.NextItearationTimeStampRDName,
                        FASConstants.NextItearationTimeStampKey,
                        timestampOfNextIteration,
                        this.partition,
                        this.cancellationToken).ConfigureAwait(false);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Exception occurred in the ChaosTestScenario loop: {0}, cancellationreq: {1}", capturedException, this.cancellationToken.IsCancellationRequested);
                }

                if (capturedException != null)
                {
                    // Unwrap an AggregateException to the inner exception of its flattened form.
                    var ae = capturedException as AggregateException;
                    if (ae != null)
                    {
                        capturedException = ae.Flatten().InnerException;
                    }

                    if (!ChaosUtil.IsExpectedException(capturedException))
                    {
                        string testErrorEventMessage = capturedException.Message;

                        // For engine exceptions a fixed resource string replaces the exception's own message.
                        if (capturedException is FabricChaosEngineException)
                        {
                            testErrorEventMessage = StringResources.ChaosError_UnexpectedInChaosEngine;
                        }

                        await this.StateManager.RegisterChaosEventAndUpdateChaosStatusAsync(
                            new TestErrorEvent(DateTime.UtcNow, testErrorEventMessage),
                            ChaosStatus.Running,
                            this.partition,
                            this.cancellationToken).ConfigureAwait(false);

                        TestabilityTrace.TraceSource.WriteInfo(TraceType, "Unexpected exception '{0}' was turned into TestErrorEvent.", capturedException);
                    }

                    // handled the exception, now clear it
                    capturedException = null;
                }

                // In test mode, when the Chaos context contains the failover key, report a transient fault on this partition.
                if (this.testMode && (this.ChaosParameters.Context != null && this.ChaosParameters.Context.ContainsKey(ChaosConstants.FailoverAfterChaosFaultsKey)))
                {
                    this.partition.ReportFault(FaultType.Transient);
                }

                // Record the pause, then pause for the (possibly backed-off) wait time before the next pass.
                await this.StateManager.RegisterChaosEventAndUpdateChaosStatusAsync(
                    new WaitingEvent(DateTime.UtcNow, StringHelper.Format(StringResources.ChaosInfo_WaitingBetweenIterations, waitTime)),
                    ChaosStatus.Running,
                    this.partition,
                    this.cancellationToken,
                    () =>
                {
                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Registering WaitingEvent for waiting '{0}' between iterations.", waitTime);
                }).ConfigureAwait(false);

                await Task.Delay(waitTime, this.cancellationToken).ConfigureAwait(false);

                // Counted for every pass that reaches this point, whether or not faults were executed.
                this.IterationsCompleted++;
            }

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Session has completed. \nTotal iterations: {0}. Total elapsed time: {1}", this.IterationsCompleted, this.GetElapsedTime());
        }
        /// <summary>
        /// Probes each machine reported as healthy by calling <c>Utility.GetTempPath</c> against it
        /// (in parallel) and marks machines whose probe ultimately fails as unhealthy.
        /// </summary>
        /// <remarks>
        /// Only two I/O failure codes (53 and 1723) are retried, and only while the shared
        /// <c>DMConstants.BpaRpcRetryTimeout</c> budget has not expired. Before each retry the remote
        /// registry named pipe is opened and the worker sleeps for five seconds.
        /// </remarks>
        /// <param name="machineHealthContainer">Supplies the machine names to probe and receives the unhealthy marks.</param>
        /// <returns>The result of <c>machineHealthContainer.EnoughHealthyMachines()</c> once all probes have finished.</returns>
        private static bool CheckRPCAccess(MachineHealthContainer machineHealthContainer)
        {
            // One retry budget shared by all machines probed in parallel.
            var retryBudget = new System.Fabric.Common.TimeoutHelper(DMConstants.BpaRpcRetryTimeout);

            SFDeployerTrace.WriteNoise(StringResources.Info_SFRpcInfo);

            Parallel.ForEach <string>(
                machineHealthContainer.GetHealthyMachineNames(),
                (string machine) =>
            {
                bool probeSucceeded = true;
                bool tryAgain       = true;

                while (tryAgain)
                {
                    tryAgain = false;

                    try
                    {
                        Utility.GetTempPath(machine);
                    }
                    catch (Exception ex)
                    {
                        int    code = ex.HResult;
                        string template;

                        if (ex is System.IO.IOException)
                        {
                            // If new failures are discovered: https://msdn.microsoft.com/en-us/library/windows/desktop/ms681382(v=vs.85).aspx
                            if (code == 53)
                            {
                                // ERROR_BAD_NETPATH - retryable
                                template = StringResources.Error_SFRpcIoNetpath;
                                tryAgain = true;
                            }
                            else if (code == 1723)
                            {
                                // RPC_S_SERVER_TOO_BUSY - retryable
                                template = StringResources.Error_SFRpcIoTooBusy;
                                tryAgain = true;
                            }
                            else if (code == 1727)
                            {
                                // RPC_S_CALL_FAILED_DNE
                                template = StringResources.Error_SFRpcIoFailedDne;
                            }
                            else
                            {
                                template = StringResources.Error_SFRpcIoGeneric;
                            }
                        }
                        else if (ex is System.Security.SecurityException)
                        {
                            // -2146233078 is COR_E_SECURITY
                            template = (code == -2146233078)
                                       ? StringResources.Error_SFRpcSecAccess
                                       : StringResources.Error_SFRpcSecGeneric;
                        }
                        else if (ex is NullReferenceException)
                        {
                            // -2146232828 is COR_E_TARGETINVOCATION
                            template = (code == -2146232828)
                                       ? StringResources.Error_SFRpcNullRegAccess
                                       : StringResources.Error_SFRpcNullGeneric;
                        }
                        else
                        {
                            // This is to catch coding errors.
                            template = StringResources.Error_SFRpcGeneric;
                        }

                        string message = string.Format(template, machine, code);

                        // The expiry check runs for every failure, retryable or not.
                        bool budgetLeft = !System.Fabric.Common.TimeoutHelper.HasExpired(retryBudget);
                        tryAgain = tryAgain && budgetLeft;

                        if (!tryAgain)
                        {
                            SFDeployerTrace.WriteError(message);

                            probeSucceeded = false;
                        }
                        else
                        {
                            SFDeployerTrace.WriteWarning(message);

                            StandaloneUtility.OpenRemoteRegistryNamedPipe(machine, retryBudget.GetRemainingTime());

                            Thread.Sleep(TimeSpan.FromSeconds(5));
                        }
                    }
                }

                if (!probeSucceeded)
                {
                    machineHealthContainer.MarkMachineAsUnhealthy(machine);
                }
            });

            return(machineHealthContainer.EnoughHealthyMachines());
        }
        /// <summary>
        /// Repeatedly queries the partitions and replicas (stateful) or instances (stateless) of
        /// <c>this.serviceName</c> until every enabled check passes or
        /// <paramref name="maximumStabilizationTimeout"/> elapses.
        /// </summary>
        /// <remarks>
        /// Every await in this method uses <c>ConfigureAwait(false)</c>, so no continuation depends on
        /// the caller's synchronization context. This keeps callers that block on the returned task from
        /// deadlocking on a captured context.
        /// </remarks>
        /// <param name="maximumStabilizationTimeout">Total time budget for the polling loop.</param>
        /// <param name="retryWait">Delay between two consecutive polls after a failed check.</param>
        /// <param name="ct">Cancellation token passed to every query and to the retry delay.</param>
        /// <returns>
        /// <c>ValidationReport.Default</c> when the last poll succeeded; otherwise a report constructed
        /// with <c>true</c> (presumably "validation failed" - confirm against ValidationReport) and the
        /// collected error text.
        /// </returns>
        /// <exception cref="FabricValidationException">
        /// Thrown when the last poll found partitions in quorum loss and the CheckQuorumLoss flag is set.
        /// </exception>
        public async Task <ValidationReport> EnsureStabilityWithReportAsync(TimeSpan maximumStabilizationTimeout, TimeSpan retryWait, CancellationToken ct)
        {
            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "Ensuring that '{0}' is online with timeout '{1}'.", this.serviceName, maximumStabilizationTimeout);

            bool checkQuorumLoss = (this.checkFlags & ValidationCheckFlag.CheckQuorumLoss) != 0;

            // Load basic information about this service.
            TestabilityTrace.TraceSource.WriteNoise(TraceSource, "Querying basic information for {0}.", this.serviceName);
            await this.LoadPartitionAndReplicaCountAsync(ct).ConfigureAwait(false);

            // startTime is handed to CheckReplicaSetSize; kept as local time because that helper is not visible here.
            DateTime      startTime = DateTime.Now;
            TimeoutHelper timer     = new TimeoutHelper(maximumStabilizationTimeout);
            bool          success   = false;

            List <Guid>   partitionsInQuorumLoss = new List <Guid>();
            StringBuilder errorString            = new StringBuilder();
            int           retryCount             = 1;

            while (!success && timer.GetRemainingTime() > TimeSpan.Zero)
            {
                TestabilityTrace.TraceSource.WriteInfo(TraceSource, "EnsureStabilityWithReportAsync(): retryCount='{0}', timer.GetRemainingTime()='{1}'", retryCount, timer.GetRemainingTime());

                var nodes = await this.TestContext.FabricCluster.GetLatestNodeInfoAsync(this.requestTimeout, this.operationTimeout, ct).ConfigureAwait(false);

                // Empty error string and list of partitions in quorum loss
                partitionsInQuorumLoss.Clear();
                errorString.Clear();

                // Assume success for this poll; any failing check below flips it back to false.
                success = true;
                int totalPartitionsFound = 0;

                bool stateful;
                ReleaseAssert.AssertIfNot(this.isStateful.TryGetValue(out stateful), "isStateful flag is not available");
                bool checkTarget  = (this.checkFlags & ValidationCheckFlag.CheckTargetReplicaSetSize) != 0;
                bool checkInBuild = (this.checkFlags & ValidationCheckFlag.CheckInBuildReplica) != 0;

                if (stateful)
                {
                    var partitionDictionary = await this.QueryPartitionAndReplicaResultAsyncStateful(ct).ConfigureAwait(false);

                    totalPartitionsFound = partitionDictionary.Count();

                    foreach (KeyValuePair <Partition, StatefulServiceReplica[]> partition in partitionDictionary)
                    {
                        bool partitionIsReady = partition.Key.PartitionStatus == ServicePartitionStatus.Ready;
                        if (!partitionIsReady)
                        {
                            var message = StringHelper.Format("Partition '{0}' is not Ready", partition.Key.PartitionId());
                            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
                            errorString.AppendLine(message);
                        }

                        if (partition.Key.PartitionStatus != ServicePartitionStatus.InQuorumLoss)
                        {
                            int validCount      = 0;
                            int inBuildReplicas = 0;
                            foreach (StatefulServiceReplica replica in partition.Value)
                            {
                                // Only Ready primaries and active secondaries count towards the replica set size.
                                if (replica.ReplicaStatus == ServiceReplicaStatus.Ready &&
                                    (replica.ReplicaRole == ReplicaRole.Primary || replica.ReplicaRole == ReplicaRole.ActiveSecondary))
                                {
                                    ++validCount;
                                }

                                if (replica.ReplicaStatus == ServiceReplicaStatus.InBuild)
                                {
                                    ++inBuildReplicas;
                                    var message = StringHelper.Format("Replica {0} for partition '{1}' is InBuild", replica.Id, partition.Key.PartitionId());
                                    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
                                    errorString.AppendLine(message);
                                }
                            }

                            bool targetAchieved = this.CheckReplicaSetSize(partition.Key.PartitionInformation.Id, validCount, startTime, nodes, errorString);
                            if (!partitionIsReady ||
                                (checkInBuild && inBuildReplicas > 0) ||
                                (checkTarget && !targetAchieved))
                            {
                                success = false;
                            }
                        }
                        else
                        {
                            // Quorum-lost partitions are collected here and only fail the poll when checkQuorumLoss is set (below).
                            partitionsInQuorumLoss.Add(partition.Key.PartitionInformation.Id);
                        }
                    }
                }
                else
                {
                    int targetInstanceCount = 0;
                    ReleaseAssert.AssertIf(!this.targetReplicaSetSize.TryGetValue(out targetInstanceCount), "targetReplicaSetSize for service: {0} should have been populated at this point.", this.serviceName);

                    bool placementConstraintsDefined = false;
                    try
                    {
                        // Get the service description to find out if there are placement constraints on the service
                        ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => this.TestContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                                this.serviceName,
                                this.requestTimeout,
                                ct),
                            this.operationTimeout,
                            ct).ConfigureAwait(false);

                        ThrowIf.IsTrue(result == null, "A description must be associated with the service: {0}", this.serviceName);

                        placementConstraintsDefined = !string.IsNullOrEmpty(result.PlacementConstraints);
                    }
                    catch (UnauthorizedAccessException)
                    {
                        // Fall back to the service group description when the plain service query is refused.
                        ServiceGroupDescription groupDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => this.TestContext.FabricClient.ServiceGroupManager.GetServiceGroupDescriptionAsync(
                                this.serviceName,
                                this.requestTimeout,
                                ct),
                            this.operationTimeout,
                            ct).ConfigureAwait(false);

                        ThrowIf.IsTrue(groupDescription == null, "A description must be associated with the service group: {0}", this.serviceName);

                        placementConstraintsDefined = !string.IsNullOrEmpty(groupDescription.ServiceDescription.PlacementConstraints);
                    }

                    // If a stateless service has instance count == -1 and it has placement constraints such
                    // that the possible number of instances cannot match the total number of nodes,
                    // we need to find out the number of eligible nodes for the service which is tracked by RDBug 8993319.
                    // Until RDBug 8993319 is fixed, we take the presence of placement constraints into consideration to make the
                    // validation more accurate.
                    if (targetInstanceCount == -1 && placementConstraintsDefined)
                    {
                        checkTarget = false;
                    }

                    var partitionDictionary = await this.QueryPartitionAndReplicaResultAsyncStateless(timer.GetRemainingTime(), ct).ConfigureAwait(false);

                    totalPartitionsFound = partitionDictionary.Count();

                    foreach (KeyValuePair <Partition, StatelessServiceInstance[]> partition in partitionDictionary)
                    {
                        bool partitionIsReady = partition.Key.PartitionStatus == ServicePartitionStatus.Ready;
                        if (!partitionIsReady)
                        {
                            var message = StringHelper.Format("Partition '{0}' is not Ready", partition.Key.PartitionId());
                            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
                            errorString.AppendLine(message);
                        }

                        int validCount = 0;
                        foreach (StatelessServiceInstance instance in partition.Value)
                        {
                            if (instance.ReplicaStatus == ServiceReplicaStatus.Ready)
                            {
                                ++validCount;
                            }
                        }

                        bool targetAchieved = this.CheckReplicaSetSize(partition.Key.PartitionInformation.Id, validCount, startTime, nodes, errorString);
                        if (!partitionIsReady ||
                            (checkTarget && !targetAchieved))
                        {
                            success = false;
                        }
                    }
                }

                if (!this.ValidatePartitionCount(totalPartitionsFound))
                {
                    success = false;
                }

                if (partitionsInQuorumLoss.Count > 0 && checkQuorumLoss)
                {
                    string quorumLossIds = string.Join(",", partitionsInQuorumLoss.ToArray());
                    var    message       = StringHelper.Format("Partitions '{0}' in quorum loss for service {1}", quorumLossIds, this.serviceName);
                    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", message);
                    errorString.AppendLine(message);
                    success = false;
                }

                if (!success)
                {
                    // Only every tenth failed poll is escalated to a warning to keep the trace volume down.
                    if (retryCount % 10 == 0)
                    {
                        TestabilityTrace.TraceSource.WriteWarning(TraceSource, "Service {0} validation failed due to issues below, will retry: \n{1}", this.serviceName, errorString);
                    }

                    // Delay before querying again so we allow some time for state to change - don't spam the node
                    await AsyncWaiter.WaitAsync(retryWait, ct).ConfigureAwait(false);
                }

                retryCount++;
            }

            // partitionsInQuorumLoss reflects the last poll only (it is cleared at the start of each poll).
            if (partitionsInQuorumLoss.Count > 0)
            {
                string partitionIds = string.Join(",", partitionsInQuorumLoss.ToArray());
                TestabilityTrace.TraceSource.WriteInfo(TraceSource, "Partitions in quorum loss for service {0} are '{1}'", this.serviceName, partitionIds);

                if (checkQuorumLoss)
                {
                    throw new FabricValidationException(StringHelper.Format(StringResources.Error_PartitionsInQuorumLoss, partitionIds, this.serviceName));
                }
            }

            if (!success)
            {
                return(new ValidationReport(
                           true,
                           StringHelper.Format(StringResources.Error_ServiceNotStable, this.serviceName, maximumStabilizationTimeout, errorString)));
            }
            else
            {
                return(ValidationReport.Default);
            }
        }
        /// <summary>
        /// Builds a text report with one line per health event whose state is not <c>HealthState.Ok</c>,
        /// walking the service's own events, then each non-Ok partition's events, then each non-Ok
        /// replica's events.
        /// </summary>
        /// <param name="serviceHealth">Service health whose events and partition health states are inspected.</param>
        /// <param name="timer">Supplies the remaining time budget for each partition/replica health query.</param>
        /// <param name="ct">Cancellation token passed to every health query.</param>
        /// <returns>The accumulated report text; empty when nothing non-Ok was found.</returns>
        private async Task <string> GetUnhealthyItemsAsync(ServiceHealth serviceHealth, TimeoutHelper timer, CancellationToken ct)
        {
            StringBuilder report = new StringBuilder();

            // Events reported directly against the service.
            foreach (var serviceEvent in serviceHealth.HealthEvents)
            {
                if (serviceEvent.HealthInformation.HealthState != HealthState.Ok)
                {
                    string line = StringHelper.Format(
                        "Service {0} health state is '{1}' with property '{2}', sourceId '{3}' and description '{4}'",
                        this.serviceName,
                        serviceEvent.HealthInformation.HealthState,
                        serviceEvent.HealthInformation.Property,
                        serviceEvent.HealthInformation.SourceId,
                        serviceEvent.HealthInformation.Description);

                    report.AppendLine(line);
                }
            }

            // Drill into every partition whose aggregated state is not Ok.
            foreach (var partitionState in serviceHealth.PartitionHealthStates)
            {
                if (partitionState.AggregatedHealthState != HealthState.Ok)
                {
                    var partitionHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () =>
                        this.TestContext.FabricClient.HealthManager.GetPartitionHealthAsync(
                            partitionState.PartitionId,
                            this.requestTimeout,
                            ct),
                        FabricClientRetryErrors.GetEntityHealthFabricErrors.Value,
                        timer.GetRemainingTime(),
                        ct).ConfigureAwait(false);

                    foreach (var partitionEvent in partitionHealth.HealthEvents)
                    {
                        if (partitionEvent.HealthInformation.HealthState != HealthState.Ok)
                        {
                            string line = StringHelper.Format(
                                "Service {0}:{1} health state is '{2}' with property '{3}', sourceId '{4}' and description '{5}'",
                                this.serviceName,
                                partitionHealth.PartitionId,
                                partitionEvent.HealthInformation.HealthState,
                                partitionEvent.HealthInformation.Property,
                                partitionEvent.HealthInformation.SourceId,
                                partitionEvent.HealthInformation.Description);

                            report.AppendLine(line);
                        }
                    }

                    // Drill into every replica of this partition whose aggregated state is not Ok.
                    foreach (var replicaState in partitionHealth.ReplicaHealthStates)
                    {
                        if (replicaState.AggregatedHealthState != HealthState.Ok)
                        {
                            var replicaHealth = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () =>
                                this.TestContext.FabricClient.HealthManager.GetReplicaHealthAsync(
                                    replicaState.PartitionId,
                                    replicaState.Id,
                                    this.requestTimeout,
                                    ct),
                                FabricClientRetryErrors.GetEntityHealthFabricErrors.Value,
                                timer.GetRemainingTime(),
                                ct).ConfigureAwait(false);

                            foreach (var replicaEvent in replicaHealth.HealthEvents)
                            {
                                if (replicaEvent.HealthInformation.HealthState != HealthState.Ok)
                                {
                                    string line = StringHelper.Format(
                                        "Service {0}:{1}:{2} health state is '{3}' with property '{4}', sourceId '{5}' and description '{6}'",
                                        this.serviceName,
                                        replicaHealth.PartitionId,
                                        replicaHealth.Id,
                                        replicaEvent.HealthInformation.HealthState,
                                        replicaEvent.HealthInformation.Property,
                                        replicaEvent.HealthInformation.SourceId,
                                        replicaEvent.HealthInformation.Description);

                                    report.AppendLine(line);
                                }
                            }
                        }
                    }
                }
            }

            return(report.ToString());
        }
        /// <summary>
        /// Repeatedly queries the health of <c>this.serviceName</c> until its aggregated health state is
        /// acceptable under <c>this.checkFlags</c> or <paramref name="maximumStabilizationTimeout"/> elapses.
        /// </summary>
        /// <param name="maximumStabilizationTimeout">Overall time budget for the service to be observed healthy.</param>
        /// <param name="retryWait">Delay between two consecutive health queries.</param>
        /// <param name="ct">Token that cancels the health queries and the wait between retries.</param>
        /// <returns>
        /// <see cref="ValidationReport.Default"/> when an acceptable health state was observed; otherwise a new
        /// report carrying the <c>Error_ServiceNotHealthy</c> message with the last collected unhealthy items.
        /// </returns>
        public async Task <ValidationReport> ValidateHealthWithReportAsync(TimeSpan maximumStabilizationTimeout, TimeSpan retryWait, CancellationToken ct)
        {
            TestabilityTrace.TraceSource.WriteInfo(TraceSource, "Validating that '{0}' is healthy with timeout '{1}'.", this.serviceName, maximumStabilizationTimeout);

            TimeoutHelper timer      = new TimeoutHelper(maximumStabilizationTimeout);
            bool          success    = false;
            string        healthinfo = string.Empty;
            int           retryCount = 1;

            while (!success && timer.GetRemainingTime() > TimeSpan.Zero)
            {
                TestabilityTrace.TraceSource.WriteInfo(TraceSource, "ValidateHealthWithReportAsync(): retryCount='{0}', timer.GetRemainingTime()='{1}'", retryCount, timer.GetRemainingTime());

                // Only the most recent attempt's details are reported.
                healthinfo = string.Empty;

                if (this.TestContext == null)
                {
                    Console.WriteLine("testcontext is null");
                }

                ReleaseAssert.AssertIfNull(this.TestContext, "test context");
                ReleaseAssert.AssertIfNull(this.serviceName, "serviceName");
                ReleaseAssert.AssertIfNull(FabricClientRetryErrors.GetEntityHealthFabricErrors.Value, "health error code");

                ApplicationHealthPolicy healthPolicy = new ApplicationHealthPolicy();

                var serviceHealthResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () =>
                    this.TestContext.FabricClient.HealthManager.GetServiceHealthAsync(
                        this.serviceName,
                        healthPolicy,
                        this.requestTimeout,
                        ct),
                    FabricClientRetryErrors.GetEntityHealthFabricErrors.Value,
                    timer.GetRemainingTime(),
                    ct).ConfigureAwait(false);

                bool checkError   = (this.checkFlags & ValidationCheckFlag.CheckError) != 0;
                bool checkWarning = (this.checkFlags & ValidationCheckFlag.CheckWarning) != 0;

                // Error and Warning only count as failures when the matching flag is set;
                // Invalid and Unknown are always treated as "not healthy yet".
                if ((checkError && serviceHealthResult.AggregatedHealthState == HealthState.Error) ||
                    (checkWarning && serviceHealthResult.AggregatedHealthState == HealthState.Warning) ||
                    serviceHealthResult.AggregatedHealthState == HealthState.Invalid ||
                    serviceHealthResult.AggregatedHealthState == HealthState.Unknown)
                {
                    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0} is health state is {1}. Will Retry check", this.serviceName, serviceHealthResult.AggregatedHealthState);
                    healthinfo = await this.GetUnhealthyItemsAsync(serviceHealthResult, timer, ct).ConfigureAwait(false);

                    // healthinfo holds free-form health descriptions, so it is passed as an argument
                    // rather than as the format string itself.
                    TestabilityTrace.TraceSource.WriteInfo(TraceSource, "{0}", healthinfo);
                }
                else
                {
                    success = true;
                }

                if (!success)
                {
                    // Escalate to a warning only on every 10th attempt to keep the trace readable.
                    if (retryCount % 10 == 0)
                    {
                        TestabilityTrace.TraceSource.WriteWarning(TraceSource, "Service {0} health validation failed due to issues below, will retry: \n{1}", this.serviceName, healthinfo);
                    }

                    // Delay before querying again so we allow some time for state to change - don't spam the node.
                    // The token is passed so that cancellation is honoured during the wait as well.
                    await AsyncWaiter.WaitAsync(retryWait, ct).ConfigureAwait(false);
                }

                retryCount++;
            }

            if (!success)
            {
                return(new ValidationReport(true, StringHelper.Format(StringResources.Error_ServiceNotHealthy, serviceName, maximumStabilizationTimeout, healthinfo)));
            }
            else
            {
                return(ValidationReport.Default);
            }
        }
            /// <summary>
            /// Induces data loss on the partition selected by <c>action.PartitionSelector</c>.
            /// Each attempt blocks "DoReconfiguration" messages for the partition on the node hosting the ready
            /// FM primary, removes ready replicas of the partition, unblocks the messages again and then polls
            /// the partition's primary-epoch data loss number. Attempts repeat until that number changes.
            /// </summary>
            /// <param name="testContext">Provides the fabric client and the action executor.</param>
            /// <param name="action">Describes the partition, the data loss mode and the timeouts; receives the result.</param>
            /// <param name="cancellationToken">Token forwarded to every fabric call and delay.</param>
            /// <exception cref="InvalidOperationException">The selected service is not stateful.</exception>
            /// <exception cref="FabricException">The partition is not in the partition list, or no ready FM primary was found.</exception>
            /// <exception cref="TimeoutException"><c>action.ActionTimeout</c> elapsed before the data loss number changed.</exception>
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, InvokeDataLossAction action, CancellationToken cancellationToken)
            {
                ThrowIf.Null(action.PartitionSelector, "PartitionSelector");

                var helper = new TimeoutHelper(action.ActionTimeout);

                ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        action.PartitionSelector.ServiceName,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                if (result.Kind != ServiceDescriptionKind.Stateful)
                {
                    throw new InvalidOperationException(StringHelper.Format(StringResources.Error_InvalidServiceTypeTestability, "DataLoss", "Stateful", action.PartitionSelector.ServiceName, "Stateless"));
                }

                // Resolve the selector to a concrete partition.
                var getPartitionStateAction = new GetSelectedPartitionStateAction(action.PartitionSelector)
                {
                    RequestTimeout = action.RequestTimeout,
                    ActionTimeout  = helper.GetRemainingTime()
                };

                await testContext.ActionExecutor.RunAsync(getPartitionStateAction, cancellationToken).ConfigureAwait(false);

                Guid partitionId = getPartitionStateAction.Result.PartitionId;

                long preDataLossNumber = 0;

                // Capture the data loss number before doing anything, as the baseline to compare against.
                ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                        action.PartitionSelector.ServiceName,
                        null,
                        action.RequestTimeout,
                        cancellationToken),
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                bool partitionFound = false;

                foreach (StatefulServicePartition partition in partitionsResult)
                {
                    if (partition.PartitionInformation.Id == partitionId)
                    {
                        preDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                        partitionFound    = true;
                        break;
                    }
                }

                if (!partitionFound)
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                }

                long postDataLossNumber = preDataLossNumber;

                // Outer loop: one full "block, remove replicas, unblock, check" attempt per iteration.
                do
                {
                    ActionTraceSource.WriteInfo(
                        TraceType,
                        "InvokeDataLossAction action pending time:{0}",
                        helper.GetRemainingTime());

                    if (helper.GetRemainingTime() <= TimeSpan.Zero)
                    {
                        throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout, "InvokeDataLoss", partitionId));
                    }

                    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                            partitionId,
                            0,
                            action.RequestTimeout,
                            cancellationToken),
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);

                    ServiceReplicaList fmReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                            Constants.FmPartitionId,
                            0,
                            action.RequestTimeout,
                            cancellationToken),
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);

                    // Find the node that hosts the ready FM primary; the blocking behavior is installed there.
                    string fmPrimaryNodeName = string.Empty;
                    var    readyFMReplicas   = fmReplicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
                    foreach (var replica in readyFMReplicas)
                    {
                        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                        ReleaseAssert.AssertIf(statefulReplica == null, "FM Replica is not a stateful replica");
                        if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
                        {
                            fmPrimaryNodeName = replica.NodeName;
                        }
                    }

                    if (string.IsNullOrEmpty(fmPrimaryNodeName))
                    {
                        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
                    }

                    UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "DoReconfiguration");
                    behavior.AddFilterForPartitionId(partitionId);
                    string behaviorName = "BlockDoReconfiguration";

                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => testContext.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                            fmPrimaryNodeName,
                            behaviorName,
                            behavior,
                            action.RequestTimeout,
                            cancellationToken),
                        helper.GetRemainingTime(),
                        cancellationToken).ConfigureAwait(false);

                    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                    // Set just before the regular removal is attempted; the finally block uses it to decide
                    // whether it still has to remove the behavior after a failure.
                    bool triedToRemovedBehavior = false;

                    try
                    {
                        var stableReplicasToRemove = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

                        ActionTraceSource.WriteInfo(TraceType, "Total number of replicas found {0}:{1}", replicasResult.Count, stableReplicasToRemove.Length);

                        // Budget of non-primary replicas to remove: all ready replicas but one for full data
                        // loss, otherwise (ready + 1) / 2 - 1. Primary replicas are removed regardless of the budget.
                        int replicasToRestartWithoutPrimary =
                            action.DataLossMode == DataLossMode.FullDataLoss
                                ? stableReplicasToRemove.Length - 1
                                : (stableReplicasToRemove.Length + 1) / 2 - 1;

                        foreach (var replica in stableReplicasToRemove)
                        {
                            var currentReplica = replica;
                            StatefulServiceReplica statefulReplica = currentReplica as StatefulServiceReplica;
                            ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");

                            ActionTraceSource.WriteInfo(
                                TraceType,
                                "Inspecting replica {0}:{1} with role {2} and status {3} to induce data loss",
                                currentReplica.Id,
                                partitionId,
                                statefulReplica.ReplicaRole,
                                statefulReplica.ReplicaStatus);

                            if (statefulReplica.ReplicaRole != ReplicaRole.Primary)
                            {
                                replicasToRestartWithoutPrimary--;
                            }

                            if (replicasToRestartWithoutPrimary >= 0 || statefulReplica.ReplicaRole == ReplicaRole.Primary)
                            {
                                ActionTraceSource.WriteInfo(TraceType, "Removing replica {0}:{1} to induce data loss", currentReplica.Id, partitionId);

                                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                    () => testContext.FabricClient.FaultManager.RemoveReplicaAsync(
                                        currentReplica.NodeName,
                                        partitionId,
                                        currentReplica.Id,
                                        CompletionMode.DoNotVerify,
                                        false, /*force remove*/
                                        action.RequestTimeout.TotalSeconds,
                                        cancellationToken),
                                    helper.GetRemainingTime(),
                                    cancellationToken).ConfigureAwait(false);
                            }
                        }

                        triedToRemovedBehavior = true;
                        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                fmPrimaryNodeName,
                                behaviorName,
                                action.RequestTimeout,
                                cancellationToken),
                            FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                            helper.GetRemainingTime(),
                            cancellationToken).ConfigureAwait(false);

                        // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                        // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                        await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                        // retry check for whether data loss number has increased 5 times else do the entire process again
                        const int maxRetryCount = 5;
                        int       retryCount    = 0;
                        do
                        {
                            partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () => testContext.FabricClient.QueryManager.GetPartitionListAsync(
                                    action.PartitionSelector.ServiceName,
                                    null,
                                    action.RequestTimeout,
                                    cancellationToken),
                                FabricClientRetryErrors.GetPartitionListFabricErrors.Value,
                                helper.GetRemainingTime(),
                                cancellationToken).ConfigureAwait(false);

                            partitionFound = false;
                            foreach (StatefulServicePartition partition in partitionsResult)
                            {
                                if (partition.PartitionInformation.Id == partitionId)
                                {
                                    postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                                    partitionFound     = true;
                                    break;
                                }
                            }

                            if (!partitionFound)
                            {
                                throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                            }

                            ActionTraceSource.WriteInfo(
                                TraceType,
                                "Checking data loss numbers for partition {0} with retryCount {1}. Current numbers {2}:{3}",
                                partitionId,
                                retryCount,
                                preDataLossNumber,
                                postDataLossNumber);

                            if (postDataLossNumber != preDataLossNumber)
                            {
                                break;
                            }

                            await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);

                            ++retryCount;
                        } while (retryCount < maxRetryCount);
                    }
                    finally
                    {
                        // Cleanup for the failure path only: the behavior must not stay installed when an
                        // exception escaped before the regular removal above was attempted.
                        // NOTE(review): this cleanup reuses cancellationToken; if the failure was a cancellation
                        // the removal below presumably observes the cancelled token too - confirm whether
                        // cleanup should use a separate token.
                        if (!triedToRemovedBehavior)
                        {
                            ActionTraceSource.WriteWarning(TraceType, "Exception after adding behavior to block messages. Removing behavior synchronously");
                            FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                () => testContext.FabricClient.TestManager.RemoveUnreliableTransportBehaviorAsync(
                                    fmPrimaryNodeName,
                                    behaviorName,
                                    action.RequestTimeout,
                                    cancellationToken),
                                FabricClientRetryErrors.RemoveUnreliableTransportBehaviorErrors.Value,
                                helper.GetRemainingTime(),
                                cancellationToken).GetAwaiter().GetResult();

                            // TODO: Wait for some time so that the removal of this unreliable transport behavior can be read from the files.
                            // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successully applied
                            Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).GetAwaiter().GetResult();
                        }
                    }
                }while (postDataLossNumber == preDataLossNumber);

                ActionTraceSource.WriteInfo(
                    TraceType,
                    "InvokeDataLossAction action completed postDataLossNumber:{0}, preDataLossNumber:{1}",
                    postDataLossNumber, preDataLossNumber);

                action.Result          = new InvokeDataLossResult(getPartitionStateAction.Result);
                this.ResultTraceString = StringHelper.Format("InvokeDataLossAction succeeded for {0} with DatalossMode = {1}", partitionId, action.DataLossMode);
            }
            /// <summary>
            /// Removes a replica. The target is taken from <c>action.NodeName</c>, <c>action.PartitionId</c> and
            /// <c>action.ReplicaId</c>; when any of them is missing, all three are resolved through
            /// <c>action.ReplicaSelector</c> instead. With <see cref="CompletionMode.Verify"/> the method polls
            /// until the replica is no longer returned by the query or is reported as dropped.
            /// </summary>
            /// <param name="testContext">Provides the fabric client and the action executor.</param>
            /// <param name="action">Describes the replica, the timeouts and the completion mode; receives the result.</param>
            /// <param name="cancellationToken">Token forwarded to every fabric call and wait.</param>
            /// <exception cref="TimeoutException">Verification did not observe the removal within <c>action.ActionTimeout</c>.</exception>
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, RemoveReplicaAction action, CancellationToken cancellationToken)
            {
                TimeoutHelper helper = new TimeoutHelper(action.ActionTimeout);

                string          nodeName              = action.NodeName;
                Guid?           partitionId           = action.PartitionId;
                long?           replicaId             = action.ReplicaId;
                SelectedReplica replicaSelectorResult = SelectedReplica.None;

                // An incomplete explicit target falls back to the selector, which then supplies all three values.
                if (string.IsNullOrEmpty(nodeName) ||
                    !partitionId.HasValue ||
                    !replicaId.HasValue)
                {
                    ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

                    var getReplicaStateAction = new GetSelectedReplicaStateAction(action.ReplicaSelector)
                    {
                        RequestTimeout = action.RequestTimeout,
                        ActionTimeout  = helper.GetRemainingTime()
                    };

                    await testContext.ActionExecutor.RunAsync(getReplicaStateAction, cancellationToken).ConfigureAwait(false);

                    var replicaStateActionResult = getReplicaStateAction.Result;
                    ReleaseAssert.AssertIf(replicaStateActionResult == null, "replicaStateActionResult cannot be null");
                    replicaSelectorResult = replicaStateActionResult.Item1;

                    partitionId = replicaStateActionResult.Item1.SelectedPartition.PartitionId;

                    Replica replicaStateResult = replicaStateActionResult.Item2;
                    ReleaseAssert.AssertIf(replicaStateResult == null, "replicaStateResult cannot be null");

                    nodeName  = replicaStateResult.NodeName;
                    replicaId = replicaStateResult.Id;
                }

                ThrowIf.IsFalse(partitionId.HasValue, "PartitionID");
                ThrowIf.IsFalse(replicaId.HasValue, "ReplicaID");

                bool forceRemove = action.ForceRemove;

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.ServiceManager.RemoveReplicaAsync(
                        nodeName,
                        partitionId.Value,
                        replicaId.Value,
                        forceRemove,
                        action.RequestTimeout,
                        cancellationToken),
                    FabricClientRetryErrors.RemoveReplicaErrors.Value,
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                if (action.CompletionMode == CompletionMode.Verify)
                {
                    // Check that replica on selected node has been removed i.e. the replica id does not exist anymore.
                    bool success = false;
                    while (helper.GetRemainingTime() > TimeSpan.Zero)
                    {
                        var replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => testContext.FabricClient.QueryManager.GetReplicaListAsync(
                                partitionId.Value,
                                replicaId.Value,
                                action.RequestTimeout,
                                cancellationToken),
                            helper.GetRemainingTime(),
                            cancellationToken).ConfigureAwait(false);

                        // An empty result and a replica in Dropped status both count as removed.
                        bool dropped = replicasResult.Count == 0;
                        if (!dropped)
                        {
                            // Since we added a replica filter the result should contain the replica or none
                            ReleaseAssert.AssertIf(replicasResult.Count > 1, "More than 1 replica returned with replica filter {0}:{1}", partitionId.Value, replicaId.Value);
                            ReleaseAssert.AssertIf(replicasResult[0].Id != replicaId, "Incorrect replica Id {0} returned by query instead of {1}", replicasResult[0].Id, replicaId);
                            dropped = replicasResult[0].ReplicaStatus == ServiceReplicaStatus.Dropped;
                        }

                        if (dropped)
                        {
                            success = true;
                            break;
                        }

                        ActionTraceSource.WriteInfo(TraceSource, "Replica = {0}:{1} not yet completely removed. Retrying...", partitionId.Value, replicaId.Value);
                        await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                    }

                    if (!success)
                    {
                        throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout,
                                                                       "RemoveReplica",
                                                                       StringHelper.Format("{0}:{1}", partitionId.Value, replicaId.Value)));
                    }
                }

                action.Result     = new RemoveReplicaResult(replicaSelectorResult);
                ResultTraceString = StringHelper.Format(
                    "RemoveReplicaOrInstance succeeded by removing replica {0}:{1} on node {2} with CompletionMode {3}",
                    partitionId.Value,
                    replicaId.Value,
                    nodeName,
                    action.CompletionMode);
            }
Beispiel #15
0
 /// <summary>
 /// Tells whether the time budget tracked by <paramref name="timeoutHelper"/> is used up.
 /// </summary>
 /// <param name="timeoutHelper">Helper whose remaining time is inspected.</param>
 /// <returns><c>true</c> when no time remains; otherwise <c>false</c>.</returns>
 public static bool HasExpired(TimeoutHelper timeoutHelper)
 {
     // "<=" rather than "==" so a negative remaining time also counts as expired.
     return(timeoutHelper.GetRemainingTime() <= TimeSpan.Zero);
 }
            /// <summary>
            /// Restarts a replica. The target is taken from <c>action.NodeName</c>, <c>action.PartitionId</c> and
            /// <c>action.ReplicaId</c>; when any of them is missing, all three are resolved through
            /// <c>action.ReplicaSelector</c> instead. No verification is performed for
            /// <see cref="CompletionMode.Verify"/> yet (see the TODO in the body).
            /// </summary>
            /// <param name="testContext">Provides the fabric client and the action executor.</param>
            /// <param name="action">Describes the replica, the timeouts and the completion mode; receives the result.</param>
            /// <param name="cancellationToken">Token forwarded to every fabric call.</param>
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartReplicaAction action, CancellationToken cancellationToken)
            {
                TimeoutHelper helper = new TimeoutHelper(action.ActionTimeout);

                string          nodeName              = action.NodeName;
                Guid?           partitionId           = action.PartitionId;
                long?           replicaId             = action.ReplicaId;
                SelectedReplica replicaSelectorResult = SelectedReplica.None;

                // An incomplete explicit target falls back to the selector, which then supplies all three values.
                if (string.IsNullOrEmpty(nodeName) ||
                    !partitionId.HasValue ||
                    !replicaId.HasValue)
                {
                    ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

                    var getReplicaStateAction = new GetSelectedReplicaStateAction(action.ReplicaSelector)
                    {
                        RequestTimeout = action.RequestTimeout,
                        ActionTimeout  = helper.GetRemainingTime()
                    };

                    await testContext.ActionExecutor.RunAsync(getReplicaStateAction, cancellationToken).ConfigureAwait(false);

                    var replicaStateActionResult = getReplicaStateAction.Result;

                    // Assert on the result itself before reading Item1/Item2 from it.
                    ReleaseAssert.AssertIf(replicaStateActionResult == null, "replicaStateActionResult cannot be null");
                    replicaSelectorResult = replicaStateActionResult.Item1;
                    ReleaseAssert.AssertIf(replicaSelectorResult == null, "replicaSelectorResult cannot be null");

                    partitionId = replicaStateActionResult.Item1.SelectedPartition.PartitionId;

                    Replica replicaStateResult = replicaStateActionResult.Item2;
                    ReleaseAssert.AssertIf(replicaStateResult == null, "replicaStateResult cannot be null");

                    nodeName  = replicaStateResult.NodeName;
                    replicaId = replicaStateResult.Id;
                }

                ThrowIf.IsFalse(partitionId.HasValue, "PartitionID");
                ThrowIf.IsFalse(replicaId.HasValue, "ReplicaID");

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.ServiceManager.RestartReplicaAsync(
                        nodeName,
                        partitionId.Value,
                        replicaId.Value,
                        action.RequestTimeout,
                        cancellationToken),
                    FabricClientRetryErrors.RestartReplicaErrors.Value,
                    helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                if (action.CompletionMode == CompletionMode.Verify)
                {
                    // TODO: Check with failover team to see how to confirm that the replica actually restarted. We do not expose instance id for persisted replicas
                }

                action.Result          = new RestartReplicaResult(replicaSelectorResult);
                this.ResultTraceString = StringHelper.Format(
                    "RestartReplicaOrInstance succeeded by restarting replica {0}:{1} node {2} with CompletionMode {3}",
                    partitionId.Value,
                    replicaId.Value,
                    nodeName,
                    action.CompletionMode);
            }
            /// <summary>
            /// Restarts a node. The node is <c>action.NodeName</c>, or, when that is empty, the node hosting the
            /// replica chosen by <c>action.ReplicaSelector</c>. With <see cref="CompletionMode.Verify"/> the method
            /// polls until the node reports a higher instance id than the one restarted and is up.
            /// </summary>
            /// <param name="testContext">Provides the fabric client and the action executor.</param>
            /// <param name="action">Describes the node, the timeouts and the completion mode; receives the result.</param>
            /// <param name="cancellationToken">Token forwarded to every fabric call and wait.</param>
            /// <exception cref="TimeoutException">Verification did not observe the restart within <c>action.ActionTimeout</c>.</exception>
            protected override async Task ExecuteActionAsync(FabricTestContext testContext, RestartNodeAction action, CancellationToken cancellationToken)
            {
                ActionTraceSource.WriteInfo(TraceSource, "Enter RestartNodeAction/ExecuteActionAsync: operationTimeout='{0}', requestTimeout='{1}'", action.ActionTimeout, action.RequestTimeout);

                this.helper = new TimeoutHelper(action.ActionTimeout);
                SelectedReplica selectedReplica  = SelectedReplica.None;
                string          nodeName         = action.NodeName;
                BigInteger      nodeInstance     = action.NodeInstance;
                bool            createFabricDump = action.CreateFabricDump;

                if (string.IsNullOrEmpty(nodeName))
                {
                    ThrowIf.Null(action.ReplicaSelector, "ReplicaSelector");

                    var getReplicaStateAction = new GetSelectedReplicaStateAction(action.ReplicaSelector)
                    {
                        RequestTimeout = action.RequestTimeout,
                        ActionTimeout  = this.helper.GetRemainingTime()
                    };

                    await testContext.ActionExecutor.RunAsync(getReplicaStateAction, cancellationToken).ConfigureAwait(false);

                    var replicaStateActionResult = getReplicaStateAction.Result;
                    ReleaseAssert.AssertIf(replicaStateActionResult == null, "replicaStateActionResult cannot be null");
                    selectedReplica = replicaStateActionResult.Item1;
                    Replica replicaStateResult = replicaStateActionResult.Item2;
                    ReleaseAssert.AssertIf(replicaStateResult == null, "replicaStateResult cannot be null");

                    nodeName     = replicaStateResult.NodeName;

                    // The node came from the selector, so any instance id on the action is not used;
                    // MinusOne makes the lookup below fetch the current one.
                    nodeInstance = BigInteger.MinusOne;
                }

                if (nodeInstance == BigInteger.MinusOne)
                {
                    var nodeInfo = await this.GetCurrentNodeInfoAsync(testContext, nodeName, action, cancellationToken).ConfigureAwait(false);

                    nodeInstance = nodeInfo.NodeInstanceId;
                }

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => testContext.FabricClient.FaultManager.RestartNodeUsingNodeNameAsync(
                        nodeName,
                        nodeInstance,
                        createFabricDump,
                        action.RequestTimeout,
                        cancellationToken),
                    this.helper.GetRemainingTime(),
                    cancellationToken).ConfigureAwait(false);

                if (action.CompletionMode == CompletionMode.Verify)
                {
                    bool success = false;
                    while (this.helper.GetRemainingTime() > TimeSpan.Zero)
                    {
                        var nodeInfo = await this.GetCurrentNodeInfoAsync(testContext, nodeName, action, cancellationToken).ConfigureAwait(false);

                        // Restarted means: a newer instance id than the one we restarted, and the node is up.
                        if (nodeInfo.NodeInstanceId > nodeInstance && nodeInfo.IsNodeUp)
                        {
                            success = true;
                            break;
                        }

                        ActionTraceSource.WriteInfo(TraceSource, "NodeName = {0} not yet restarted. '{1}' seconds remain. Retrying...", nodeName, this.helper.GetRemainingTime().TotalSeconds);
                        await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                    }

                    if (!success)
                    {
                        throw new TimeoutException(StringHelper.Format(StringResources.Error_TestabilityActionTimeout,
                                                                       "RestartNode",
                                                                       nodeName));
                    }
                }

                // create result
                action.Result = new RestartNodeResult(selectedReplica, new NodeResult(nodeName, nodeInstance));

                ResultTraceString = StringHelper.Format("RestartNodeAction succeeded for {0}:{1} with CompletionMode = {2}", nodeName, nodeInstance, action.CompletionMode);
            }