Example #1
0
        /// <summary>
        /// Initializes the state of a newly received command.
        /// </summary>
        /// <param name="operationId">Value stored in <c>OperationId</c>.</param>
        /// <param name="actionType">Value stored in <c>ActionType</c>.</param>
        /// <param name="serviceInternalFaultInfo">Value stored in <c>ServiceInternalFaultInfo</c>.</param>
        public ActionStateBase(Guid operationId, ActionType actionType, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            // The progress stack always starts with IntentSaved as its only entry.
            var initialProgress = new Stack<StepStateNames>();
            initialProgress.Push(StepStateNames.IntentSaved);

            this.OperationId = operationId;
            this.ActionType = actionType;
            this.ServiceInternalFaultInfo = serviceInternalFaultInfo;
            this.StateProgress = initialProgress;

            // Receipt time is recorded in UTC.
            this.TimeReceived = DateTime.UtcNow;
        }
        /// <summary>
        /// Creates a <c>TestRetryStepState</c> for the operation, initializes it in the action store
        /// using <c>FASConstants.DefaultTestTimeout</c>, and enqueues it.
        /// </summary>
        /// <remarks>Test only, only accessible from "internal test", not from fabric client.</remarks>
        /// <param name="operationId">Identifier of the operation.</param>
        /// <param name="serviceInternalFaultInfo">Fault information passed through to the new state.</param>
        /// <returns>A task that completes once the state has been initialized and enqueued.</returns>
        public async Task ProcessRetryStepCommandAsync(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            ActionStateBase actionState = new TestRetryStepState(operationId, serviceInternalFaultInfo);

            try
            {
                // No synchronization context is needed to continue; matches the other awaits in this code base.
                await this.actionStore.InitializeNewActionAsync(actionState, FASConstants.DefaultTestTimeout).ConfigureAwait(false);

                this.Enqueue(actionState);
            }
            catch (Exception e)
            {
                // Trace for diagnosis, then let the caller observe the original exception.
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, e);
                throw;
            }
        }
Example #3
0
 /// <summary>
 /// Creates the state for an <c>ActionType.TestRetryStep</c> command with
 /// <c>RetryStepWithoutRollingBackOnFailure</c> enabled.
 /// </summary>
 /// <param name="operationId">Identifier passed to the base state.</param>
 /// <param name="serviceInternalFaultInfo">Fault information passed to the base state.</param>
 public TestRetryStepState(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo)
     : base(operationId, ActionType.TestRetryStep, serviceInternalFaultInfo)
 {
     // This command type always opts in to retrying a failed step in place.
     this.RetryStepWithoutRollingBackOnFailure = true;
 }
Example #4
0
        /// <summary>
        /// Waits indefinitely; the wait only ends when <paramref name="cancellationToken"/> is cancelled.
        /// </summary>
        /// <param name="cancellationToken">Token that ends the wait.</param>
        /// <param name="serviceInternalFaultInfo">Not used by this step.</param>
        /// <returns><c>null</c>, if the delay ever completes without throwing.</returns>
        public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            // Intentionally get stuck: an infinite delay that only cancellation can interrupt.
            Task stuckForever = Task.Delay(Timeout.Infinite, cancellationToken);
            await stuckForever.ConfigureAwait(false);

            return null;
        }
            /// <summary>
            /// Validates that the target service is stateful and persisted, resolves the target partition,
            /// chooses the replicas to act on according to the quorum loss mode, and records the partition id,
            /// replica ids and per-node behavior names in the command state.
            /// </summary>
            /// <param name="cancellationToken">Token passed to every query issued by this step.</param>
            /// <param name="serviceInternalFaultInfo">Not used by this step.</param>
            /// <returns>The command state, with <c>PerformingActions</c> pushed onto its progress stack.</returns>
            /// <exception cref="FabricInvalidForStatelessServicesException">The service is not stateful.</exception>
            /// <exception cref="FabricOnlyValidForStatefulPersistentServicesException">The service has no persisted state.</exception>
            /// <exception cref="FabricTransientException">No target replica list could be determined.</exception>
            public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeQuorumLossState state = Convert(this.State);

                // get info about the service so we can check type and trss
                ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        this.partitionSelector.ServiceName,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                if (result.Kind != ServiceDescriptionKind.Stateful)
                {
                    // The message in the first arg is only for debugging, it is not returned to the user.
                    throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
                }

                StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;

                ReleaseAssert.AssertIf(statefulServiceDescription == null, string.Format(CultureInfo.InvariantCulture, "{0} - Service is not a stateful service", this.State.OperationId));

                if (!statefulServiceDescription.HasPersistedState)
                {
                    // The message in the first arg is only for debugging, it is not returned to the user.
                    throw new FabricOnlyValidForStatefulPersistentServicesException("This is only valid for stateful persistent services", FabricErrorCode.OnlyValidForStatefulPersistentServices);
                }

                SelectedPartition targetPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
                    this.FabricClient,
                    this.partitionSelector,
                    this.RequestTimeout,
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                Guid partitionId = targetPartition.PartitionId;

                // get data about replicas in that partition
                ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // Every replica of a stateful service is expected to be a StatefulServiceReplica.
                List<StatefulServiceReplica> tempReplicas = new List<StatefulServiceReplica>();

                foreach (var replica in replicasResult)
                {
                    StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulReplica == null, "Expected stateful replica");
                    tempReplicas.Add(statefulReplica);
                }

                List<StatefulServiceReplica> targetReplicas = null;

                if (this.quorumLossMode == QuorumLossMode.AllReplicas)
                {
                    targetReplicas = tempReplicas.Where(r => r.ReplicaRole == ReplicaRole.Primary || r.ReplicaRole == ReplicaRole.ActiveSecondary).ToList();
                }
                else if (this.quorumLossMode == QuorumLossMode.QuorumReplicas)
                {
                    targetReplicas = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, tempReplicas);
                }
                else
                {
                    throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedQuorumLossMode);
                }

                if (targetReplicas == null)
                {
                    // This will cause the command to rollback and retry
                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                // Only (node name, behavior name) pairs are recorded here; this step applies no
                // transport behavior itself, so no behavior object is constructed.
                List<Tuple<string, string>> unreliableTransportInfoList = new List<Tuple<string, string>>();

                foreach (StatefulServiceReplica targetReplica in targetReplicas)
                {
                    string nodeName = targetReplica.NodeName;
                    string behaviorName = this.CreateBehaviorName(nodeName);

                    unreliableTransportInfoList.Add(new Tuple<string, string>(nodeName, behaviorName));
                }

                state.StateProgress.Push(StepStateNames.PerformingActions);

                state.Info.PartitionId = partitionId;
                state.Info.ReplicaIds = targetReplicas.Select(r => r.Id).ToList();
                state.Info.UnreliableTransportInfo = unreliableTransportInfoList;

                return state;
            }
Example #6
0
            /// <summary>
            /// Calls StartNodeUsingNodeNameAsync for the node recorded in the command state, classifies the
            /// outcome as success, retry or fatal, and on success updates the stopped node table and validates
            /// the result before marking the command as completed.
            /// </summary>
            /// <param name="cancellationToken">Token passed to every call made by this step.</param>
            /// <param name="serviceInternalFaultInfo">Passed to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
            /// <returns>The command state, with <c>CompletedSuccessfully</c> pushed onto its progress stack.</returns>
            /// <exception cref="RetrySameStepException">The outcome was classified as <c>RetryStep</c>.</exception>
            /// <exception cref="FatalException">The outcome was classified as <c>Fail</c>.</exception>
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                NodeCommandState state = Convert(this.State);

                // The return value is ignored, this is just being used to check if the RemoveNodeState was called.
                Node queriedNode = await FaultAnalysisServiceUtility.GetNodeInfoAsync(
                    this.State.OperationId,
                    this.FabricClient,
                    state.Info.NodeName,
                    this.action.Partition,
                    this.action.StateManager,
                    this.action.StoppedNodeTable,
                    this.RequestTimeout,
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - calling StartNodeUsingNodeNameAsync, ApiInputNodeInstanceId={1}", this.State.OperationId, state.Info.InputNodeInstanceId);

                // Holds whatever the start call threw so it can be classified after the cancellation check.
                Exception exception = null;

                try
                {
                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.FaultManager.StartNodeUsingNodeNameAsync(
                            state.Info.NodeName,
                            state.Info.InputNodeInstanceId,
                            null,
                            0,
                            this.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.StartNodeErrors.Value,
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);
                }
                catch (Exception e)
                {
                    // Captured rather than rethrown; the decision to retry or fail is made below.
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, "{0} - StartNodeUsingNodeNameAsync threw {1}", this.State.OperationId, e);
                    exception = e;
                }

                // Cancellation takes precedence over classifying a captured exception.
                cancellationToken.ThrowIfCancellationRequested();

                SuccessRetryOrFail status = SuccessRetryOrFail.Invalid;

                if (exception != null)
                {
                    FabricException fe = exception as FabricException;
                    if (fe != null)
                    {
                        status = this.HandleFabricException(fe, state);
                    }
                    else
                    {
                        // Any non-FabricException is treated as retryable.
                        // NOTE(review): the message says "ErrorCode" but the argument is the whole exception.
                        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, "{0} - StartNodeUsingNodeNameAsync threw non-FabricException with ErrorCode={1}", this.State.OperationId, exception);
                        status = SuccessRetryOrFail.RetryStep;
                    }
                }
                else
                {
                    // success
                    status = SuccessRetryOrFail.Success;

                    await FaultAnalysisServiceUtility.SetStoppedNodeStateAsync(
                        this.action.State.OperationId,
                        this.action.Partition,
                        this.action.StateManager,
                        this.action.StoppedNodeTable,
                        state.Info.NodeName,
                        false,
                        cancellationToken).ConfigureAwait(false);
                }

                // Runs on every path (success, retry or fail) before the status is acted on.
                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                if (status == SuccessRetryOrFail.RetryStep)
                {
                    throw new RetrySameStepException("retrystep", exception);
                }
                else if (status == SuccessRetryOrFail.Fail)
                {
                    throw new FatalException("fatal", exception);
                }
                else if (status == SuccessRetryOrFail.Success)
                {
                    // no-op
                }
                else
                {
                    // Reached when status is still Invalid (its initial value) or any other unexpected value.
                    ReleaseAssert.Failfast(string.Format(CultureInfo.InvariantCulture, "This condition should not have been hit.  OperationId: {0}", this.State.OperationId));
                }

                await this.ValidateAsync(this.FabricClient, state, cancellationToken).ConfigureAwait(false);

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);
                return(state);
            }
 /// <summary>
 /// Creates the state for an <c>ActionType.InvokeDataLoss</c> command.
 /// </summary>
 /// <param name="operationId">Identifier passed to the base state.</param>
 /// <param name="serviceInternalFaultInfo">Fault information passed to the base state.</param>
 /// <param name="partitionSelector">Partition selector stored in the new <c>InvokeDataLossInfo</c>.</param>
 /// <param name="dataLossMode">Data loss mode stored in the new <c>InvokeDataLossInfo</c>.</param>
 public InvokeDataLossState(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo, PartitionSelector partitionSelector, DataLossMode dataLossMode)
     : base(operationId, ActionType.InvokeDataLoss, serviceInternalFaultInfo)
 {
     this.Info = new InvokeDataLossInfo(partitionSelector, dataLossMode);
 }
Example #8
0
        /// <summary>
        /// Cleans up the step on top of the command's progress stack, retrying with a fixed backoff until the
        /// cleanup succeeds, a forced rollback is observed, or an exception escapes the loop.
        /// </summary>
        /// <param name="fabricClient">Client handed to <c>action.GetStep</c>.</param>
        /// <param name="action">Action used to resolve the step for the current state.</param>
        /// <param name="actionState">Command state whose top progress entry is cleaned up and popped.</param>
        /// <param name="cancellationToken">Token checked on every iteration and passed to all awaited calls.</param>
        /// <param name="serviceInternalFaultInfo">Not used by this method.</param>
        private async Task CleanupStepAsync(
            FabricClient fabricClient,
            FabricTestAction action,
            ActionStateBase actionState,
            CancellationToken cancellationToken,
            ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            StepStateNames currentStepName = actionState.StateProgress.Peek();

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Cleaning up state={0}, name={1}, key={2}", currentStepName, actionState.ActionType, actionState.OperationId);

            StepBase step = action.GetStep(fabricClient, actionState, currentStepName, cancellationToken);

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Cleaning up {1}", actionState.OperationId, step.StepName);

            try
            {
                bool cleanedUp = false;

                while (!cleanedUp)
                {
                    cancellationToken.ThrowIfCancellationRequested();
                    RollbackState rollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.InnerCleanupLoop).ConfigureAwait(false);

                    if (rollbackState == RollbackState.RollingBackForce)
                    {
                        // A forced rollback abandons the cleanup without popping the step.
                        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Inner cleanup loop read RollingBackForce", actionState.OperationId);
                        break;
                    }

                    if (rollbackState == RollbackState.RollingBackDueToUserCancel)
                    {
                        // Nothing more to do than trace: a rollback is already in progress.
                        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Inner cleanup loop read RollingBackDueToUserCancel", actionState.OperationId);
                    }

                    try
                    {
                        await step.CleanupAsync(cancellationToken).ConfigureAwait(false);

                        actionState.StateProgress.Pop();
                        cleanedUp = true;
                    }
                    catch (Exception cleanupException)
                    {
                        TestabilityTrace.TraceSource.WriteWarning(
                            TraceType,
                            "{0} - Cleanup of action type={1}, failed with {2}, retrying",
                            actionState.OperationId,
                            actionState.ActionType,
                            cleanupException);
                    }

                    if (!cleanedUp)
                    {
                        // Back off before the next attempt.
                        await Task.Delay(TimeSpan.FromSeconds(this.commandStepRetryBackoffInSeconds), cancellationToken).ConfigureAwait(false);
                    }
                }
            }
            catch (Exception e)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - CleanupStepAsync, error: {1}", actionState.OperationId, e.ToString());
                throw;
            }
        }
Example #9
0
        /// <summary>
        /// Runs the step on top of the command's progress stack. When the command has
        /// <c>RetryStepWithoutRollingBackOnFailure</c> set, a failing step is retried in place after a fixed
        /// backoff; otherwise the step's exception propagates to the caller.
        /// </summary>
        /// <param name="fabricClient">Client handed to <c>action.GetStep</c>.</param>
        /// <param name="action">Action used to resolve the step for the current state.</param>
        /// <param name="actionState">Command state whose top progress entry selects the step to run.</param>
        /// <param name="cancellationToken">Token checked on every iteration and passed to all awaited calls.</param>
        /// <param name="serviceInternalFaultInfo">Passed through to the step's <c>RunAsync</c>.</param>
        private async Task RunStepAsync(
            FabricClient fabricClient,
            FabricTestAction action,
            ActionStateBase actionState,
            CancellationToken cancellationToken,
            ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            StepStateNames state = actionState.StateProgress.Peek();

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Running state={0}, name={1}, key={2}", state, actionState.ActionType, actionState.OperationId);

            StepBase actionUnit = action.GetStep(fabricClient, actionState, state, cancellationToken);

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Running {1}", actionState.OperationId, actionUnit.StepName);
            try
            {
                while (true)
                {
                    cancellationToken.ThrowIfCancellationRequested();
                    RollbackState readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.InnerForwardLoop).ConfigureAwait(false);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - readRollbackState={1}", actionState.OperationId, readRollbackState);

                    // If RetryStepWithoutRollingbackOnFailure == true, then don't allow graceful user cancel
                    if (!actionState.RetryStepWithoutRollingBackOnFailure && (readRollbackState == RollbackState.RollingBackDueToUserCancel))
                    {
                        TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - read RollingBackDueToUserCancel breaking from Run loop inside RunStepAsync()", actionState.OperationId);
                        break;
                    }

                    // RollingBackForce always stops execution
                    if (readRollbackState == RollbackState.RollingBackForce)
                    {
                        break;
                    }

                    try
                    {
                        TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0}, {1} - calling Step.Run()", actionState.OperationId, actionState.ActionType);

                        // The state returned by the step is not used here.
                        await actionUnit.RunAsync(cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);

                        TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0}, {1} - calling break after run", actionState.OperationId, actionState.ActionType);
                        break;
                    }
                    catch (Exception runException)
                    {
                        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0}, {1} - runException {2}", actionState.OperationId, actionState.ActionType, runException);

                        if (!actionState.RetryStepWithoutRollingBackOnFailure)
                        {
                            throw;
                        }

                        TestabilityTrace.TraceSource.WriteWarning(
                            TraceType,
                            "{0}, {1} has RetryStepWithoutRollingbackOnFailure set to true, retrying step name='{2}'.  Caught exception: {3}",
                            actionState.OperationId,
                            actionState.ActionType,
                            actionUnit.StepName,
                            runException);

                        this.ProcessRetryStepExceptions(actionState.OperationId, runException);
                    }

                    // Only reached when the catch block above chose to retry: a successful run breaks out
                    // of the loop and every other failure rethrows.
                    await Task.Delay(TimeSpan.FromSeconds(this.commandStepRetryBackoffInSeconds), cancellationToken).ConfigureAwait(false);
                }
            }
            catch (Exception e)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - RunState, error: {1}", actionState.OperationId, e.ToString());
                throw;
            }
        }
            /// <summary>
            /// Validates the restart mode against the service kind, resolves the target partition, locates the
            /// node hosting the ready Failover Manager primary, and records that information in the command state.
            /// </summary>
            /// <param name="cancellationToken">Token passed to every query issued by this step.</param>
            /// <param name="serviceInternalFaultInfo">Not used by this step.</param>
            /// <returns>The command state, with <c>PerformingActions</c> pushed onto its progress stack.</returns>
            /// <exception cref="FabricException">No ready Failover Manager primary replica was found.</exception>
            public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "Inside CollectingState, service={0}", this.partitionSelector.ServiceName);
                RestartPartitionState commandState = Convert(this.State);

                // Fetch the service description so the requested mode can be validated against the service kind.
                ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        this.partitionSelector.ServiceName,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                bool isStateful = serviceDescription.Kind == ServiceDescriptionKind.Stateful;

                if (!isStateful && this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
                {
                    // This text is only for debugging, it is not returned to the user.
                    string debugText = "RestartPartition: for stateless services only RestartPartitionMode.AllReplicasOrInstances is valid";
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, debugText);
                    throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, debugText);
                }

                bool hasPersistedState = false;

                if (isStateful)
                {
                    StatefulServiceDescription statefulDescription = serviceDescription as StatefulServiceDescription;
                    ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
                    hasPersistedState = statefulDescription.HasPersistedState;
                }

                SelectedPartition selectedPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
                    this.FabricClient,
                    this.partitionSelector,
                    this.RequestTimeout,
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                Guid partitionId = selectedPartition.PartitionId;

                // Query the target partition's replicas; the returned list is not read by this step.
                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // Query the Failover Manager partition's replicas in order to find its primary.
                ServiceReplicaList fmReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        FASConstants.FmPartitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                string fmPrimaryNodeName = string.Empty;

                foreach (var fmReplica in fmReplicas.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready))
                {
                    StatefulServiceReplica statefulFmReplica = fmReplica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");
                    if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
                    {
                        fmPrimaryNodeName = fmReplica.NodeName;
                    }
                }

                if (string.IsNullOrEmpty(fmPrimaryNodeName))
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
                }

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary is at node={1}", this.State.OperationId, fmPrimaryNodeName);

                // One behavior per operation, applied on the node hosting the FM primary.
                string behaviorName = RestartingSelectedReplicas.UTBehaviorPrefixName + "_" + this.State.OperationId;
                var unreliableTransportInfo = new List<Tuple<string, string>>
                {
                    new Tuple<string, string>(fmPrimaryNodeName, behaviorName),
                };

                commandState.StateProgress.Push(StepStateNames.PerformingActions);
                commandState.Info.PartitionId = partitionId;
                commandState.Info.NodeName = fmPrimaryNodeName;
                commandState.Info.HasPersistedState = hasPersistedState;
                commandState.Info.UnreliableTransportInfo = unreliableTransportInfo;

                return commandState;
            }
            /// <summary>
            /// Applies an unreliable transport behavior on the Failover Manager primary node filtered to the target
            /// partition, restarts (persisted services) or removes (otherwise) each ready replica of the
            /// partition, and then removes the unreliable transport behavior.
            /// </summary>
            /// <param name="cancellationToken">Token passed to every call made by this step.</param>
            /// <param name="serviceInternalFaultInfo">Passed to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
            /// <returns>The command state, with <c>CompletedSuccessfully</c> pushed onto its progress stack.</returns>
            public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                RestartPartitionState state = Convert(this.State);

                // All inputs were recorded in the command state before this step runs.
                Guid partitionId = state.Info.PartitionId;
                bool hasPersistedState = state.Info.HasPersistedState;
                string failoverManagerPrimaryNodeName = state.Info.NodeName;
                string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;

                System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
                behavior.AddFilterForPartitionId(partitionId);

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        failoverManagerPrimaryNodeName,
                        behaviorName,
                        behavior,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applied UT on partitionId {1}, node={2}", this.State.OperationId, partitionId, failoverManagerPrimaryNodeName);
                ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // Only replicas currently reported as Ready are acted on.
                var stableReplicasToRestart = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

                foreach (var replica in stableReplicasToRestart)
                {
                    if (this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
                    {
                        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                        ReleaseAssert.AssertIf(statefulReplica == null, "Stateful service replica is not StatefulServiceReplica");
                        if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
                        {
                            // In this mode the primary is left alone.
                            continue;
                        }
                    }

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - restarting replica partition={1}, node={2}, replica id={3}", this.State.OperationId, partitionId, replica.NodeName, replica.Id);
                    if (hasPersistedState)
                    {
                        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => this.FabricClient.ServiceManager.RestartReplicaAsync(
                                replica.NodeName,
                                partitionId,
                                replica.Id,
                                this.RequestTimeout,
                                cancellationToken),
                            FabricClientRetryErrors.RestartReplicaErrors.Value,
                            this.OperationTimeout,
                            cancellationToken).ConfigureAwait(false);
                    }
                    else
                    {
                        // Without persisted state the replica is removed instead of restarted.
                        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                            () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                                replica.NodeName,
                                partitionId,
                                replica.Id,
                                this.RequestTimeout,
                                cancellationToken),
                            FabricClientRetryErrors.RemoveReplicaErrors.Value,
                            this.OperationTimeout,
                            cancellationToken).ConfigureAwait(false);
                    }
                }

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                // Awaited with ConfigureAwait(false) like every other await in this method.
                await RemoveUnreliableTransportAsync(this.State, this.FabricClient, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                return state;
            }
Example #12
0
 /// <summary>
 /// Creates the state for an <c>ActionType.InvokeQuorumLoss</c> command.
 /// </summary>
 /// <param name="operationId">Identifier passed to the base state.</param>
 /// <param name="serviceInternalFaultInfo">Fault information passed to the base state.</param>
 /// <param name="partitionSelector">Partition selector stored in the new <c>InvokeQuorumLossInfo</c>.</param>
 /// <param name="quorumLossMode">Quorum loss mode stored in the new <c>InvokeQuorumLossInfo</c>.</param>
 /// <param name="quorumLossDuration">Duration stored in the new <c>InvokeQuorumLossInfo</c>.</param>
 public InvokeQuorumLossState(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo, PartitionSelector partitionSelector, QuorumLossMode quorumLossMode, TimeSpan quorumLossDuration)
     : base(operationId, ActionType.InvokeQuorumLoss, serviceInternalFaultInfo)
 {
     this.Info = new InvokeQuorumLossInfo(partitionSelector, quorumLossMode, quorumLossDuration);
 }
        /// <summary>
        /// Registers the node with the node synchronizer, creates a <c>StopNode</c> command state for it,
        /// initializes that state in the action store and enqueues it. If anything fails after the node was
        /// registered, the registration is removed again before the exception is rethrown.
        /// </summary>
        /// <param name="operationId">Identifier of the operation.</param>
        /// <param name="nodeName">Name of the node the command targets.</param>
        /// <param name="nodeInstanceId">Node instance id passed to the command state.</param>
        /// <param name="durationInSeconds">Duration passed to the command state.</param>
        /// <param name="timeout">Timeout for initializing the action in the store.</param>
        /// <param name="serviceInternalFaultInfo">Fault information passed through to the command state.</param>
        /// <returns>A task that completes once the state has been initialized and enqueued.</returns>
        public async Task ProcessStopNodeCommandAsync(Guid operationId, string nodeName, BigInteger nodeInstanceId, int durationInSeconds, TimeSpan timeout, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - ProcessStopNodeCommandAsync, duration is {1}", operationId, durationInSeconds);
            this.entitySynch.NodeSynchronizer.Add(nodeName);

            try
            {
                // Constructed inside the try so that a failure here also releases the node registered above.
                NodeCommandState actionState = new NodeCommandState(ActionType.StopNode, operationId, this.entitySynch.NodeSynchronizer, serviceInternalFaultInfo, nodeName, nodeInstanceId, durationInSeconds);

                // After this call finishes the intent has been persisted
                await this.actionStore.InitializeNewActionAsync(actionState, timeout).ConfigureAwait(false);

                this.Enqueue(actionState);
            }
            catch (Exception e)
            {
                // Undo the registration so the node is not left marked as busy by a command that never started.
                this.entitySynch.NodeSynchronizer.Remove(nodeName);
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, e);
                throw;
            }
        }
        /// <summary>
        /// Validates the restart mode, creates a <c>RestartPartitionState</c> for the operation, initializes it
        /// in the action store and enqueues it.
        /// </summary>
        /// <remarks>Use this method signature for now until the actual client interface is decided.</remarks>
        /// <param name="operationId">Identifier of the operation.</param>
        /// <param name="partitionSelector">Selector passed to the command state.</param>
        /// <param name="restartPartitionMode">Mode checked by <c>ThrowIfRestartPartitionModeInvalid</c> and passed to the command state.</param>
        /// <param name="timeout">Timeout for initializing the action in the store.</param>
        /// <param name="serviceInternalFaultInfo">Fault information passed through to the command state.</param>
        /// <returns>A task that completes once the state has been initialized and enqueued.</returns>
        public async Task ProcessRestartPartitionCommandAsync(Guid operationId, PartitionSelector partitionSelector, RestartPartitionMode restartPartitionMode, TimeSpan timeout, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            // Validation happens before the try block, so a rejected mode is not traced by the handler below.
            ThrowIfRestartPartitionModeInvalid(restartPartitionMode);

            RestartPartitionState actionState = new RestartPartitionState(operationId, serviceInternalFaultInfo, partitionSelector, restartPartitionMode);

            try
            {
                // After this call finishes the intent has been persisted
                await this.actionStore.InitializeNewActionAsync(actionState, timeout).ConfigureAwait(false);

                this.Enqueue(actionState);
            }
            catch (Exception e)
            {
                // Trace for diagnosis, then let the caller observe the original exception.
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, e);
                throw;
            }
        }
Example #15
0
 /// <summary>
 /// Creates the state for a command of type <see cref="ActionType.TestStuck"/>. All initialization
 /// is delegated to the base constructor; this type adds nothing of its own here.
 /// </summary>
 /// <param name="operationId">Identifier of the operation, forwarded to the base constructor.</param>
 /// <param name="serviceInternalFaultInfo">Internal fault info, forwarded to the base constructor.</param>
 public StuckState(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo)
     : base(operationId, ActionType.TestStuck, serviceInternalFaultInfo)
 {
 }
 /// <summary>
 /// Creates the state for an <see cref="ActionType.RestartPartition"/> command and records the
 /// caller-supplied partition selector and restart mode in <c>Info</c>.
 /// </summary>
 /// <param name="operationId">Identifier of the operation, forwarded to the base constructor.</param>
 /// <param name="serviceInternalFaultInfo">Internal fault info, forwarded to the base constructor.</param>
 /// <param name="partitionSelector">Selector stored in the command info.</param>
 /// <param name="restartPartitionMode">Restart mode stored in the command info.</param>
 public RestartPartitionState(
     Guid operationId,
     ServiceInternalFaultInfo serviceInternalFaultInfo,
     PartitionSelector partitionSelector,
     RestartPartitionMode restartPartitionMode)
     : base(operationId, ActionType.RestartPartition, serviceInternalFaultInfo)
 {
     RestartPartitionInfo restartInfo = new RestartPartitionInfo(partitionSelector, restartPartitionMode);

     this.Info = restartInfo;
 }
Example #17
0
        /// <summary>
        /// Runs one pass of the command state machine for <paramref name="actionState"/>.
        /// </summary>
        /// <remarks>
        /// The pass has three phases:
        /// 1. Forward loop - while the top of <c>StateProgress</c> is not <c>CompletedSuccessfully</c>, run the
        ///    next step via <c>RunStepAsync</c> and persist the state. An exception ends the loop and records
        ///    whether the action will be retried or failed.
        /// 2. Rollback loop - when a rollback state is set, run <c>CleanupStepAsync</c> until the top of
        ///    <c>StateProgress</c> is <c>IntentSaved</c> or <c>Failed</c>, then push the resulting step name
        ///    (<c>LookingUpState</c> for a retry, <c>Failed</c> otherwise).
        /// 3. Terminal handling - persist the final state for the remaining user-cancel and force-rollback cases.
        /// <see cref="FabricNotPrimaryException"/> and <see cref="FabricObjectClosedException"/> are always
        /// rethrown to the caller, as is the <see cref="OperationCanceledException"/> raised when
        /// <paramref name="cancellationToken"/> is signalled. Any other exception escaping a cleanup step
        /// fails fast.
        /// </remarks>
        /// <param name="fabricClient">Client passed through to the run and cleanup steps.</param>
        /// <param name="action">Action passed through to the run and cleanup steps.</param>
        /// <param name="actionState">State of the command being driven; mutated and persisted by this method.</param>
        /// <param name="serviceInternalFaultInfo">Internal fault info passed to the steps and to the test fault hook.</param>
        /// <param name="cancellationToken">Token checked at the top of every loop iteration.</param>
        private async Task RunStateMachineAsync(
            FabricClient fabricClient,
            FabricTestAction action,
            ActionStateBase actionState,
            ServiceInternalFaultInfo serviceInternalFaultInfo,
            CancellationToken cancellationToken)
        {
            ReleaseAssert.AssertIf(actionState.StateProgress == null || actionState.StateProgress.Count == 0, "ActionProgress should not be null or empty");

            Exception actionError = null;

            if (actionState.RollbackState == RollbackState.NotRollingBack ||
                (actionState.RollbackState != RollbackState.RollingBackForce && actionState.RetryStepWithoutRollingBackOnFailure))
            {
                // TODO: should also include Error
                while (actionState.StateProgress.Peek() != StepStateNames.CompletedSuccessfully)
                {
                    cancellationToken.ThrowIfCancellationRequested();

                    RollbackState readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.ForwardLoop).ConfigureAwait(false);

                    if ((readRollbackState == RollbackState.RollingBackForce) ||
                        ((readRollbackState == RollbackState.RollingBackDueToUserCancel) && !actionState.RetryStepWithoutRollingBackOnFailure))
                    {
                        break;
                    }

                    try
                    {
                        await this.RunStepAsync(fabricClient, action, actionState, cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);

                        ActionTest.PerformInternalServiceFaultIfRequested(actionState.OperationId, serviceInternalFaultInfo, actionState, cancellationToken);
                        if (actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
                        {
                            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - completed successfully, clearing ErrorCausingRollback", actionState.OperationId);
                            actionState.ErrorCausingRollback = 0;
                        }

                        actionState.TimeStopped = DateTime.UtcNow;
                        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
                    }
                    catch (RetrySameStepException)
                    {
                        // Retry the command in the same step - do not rollback or go forward, and do not call ActionStore.UpdateActionStateAsync().
                        // Both placeholders need an argument: {0} is the operation id, {1} the step being retried.
                        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - threw RetrySameStepException, retrying state {1} ", actionState.OperationId, actionState.StateProgress.Peek());
                    }
                    catch (FabricNotPrimaryException)
                    {
                        throw;
                    }
                    catch (FabricObjectClosedException)
                    {
                        throw;
                    }
                    catch (Exception e)
                    {
                        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - RunStateMachineAsync caught: {1}", actionState.OperationId, e.ToString());

                        // NOTE(review): this blocks synchronously on an async call. Presumably written this way because
                        // await was not permitted inside catch blocks at the time - confirm the language version before changing.
                        readRollbackState = this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.ForwardLoopExceptionBlock).GetAwaiter().GetResult();

                        // 1st line: if this is a force rollback (RollingBackForce), just exit
                        // 2nd line: if !RetryStepWithoutRollingBackOnFailure and there was a graceful cancel then exit this block and proceed to the rollback code block below.
                        // If RetryStepWithoutRollingBackOnFailure is true, which it is only for the node steps today,  then first call HandleRollback to translate the exception.
                        if ((readRollbackState == RollbackState.RollingBackForce) ||
                            ((readRollbackState == RollbackState.RollingBackDueToUserCancel) && !actionState.RetryStepWithoutRollingBackOnFailure))
                        {
                            break;
                        }
                        else
                        {
                            bool isRetryable = this.HandleRollback(actionState.OperationId, e);
                            if (isRetryable)
                            {
                                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - observed retryable exception, will retry action.  Exception: {1}", actionState.OperationId, e.ToString());
                                actionState.RollbackState = RollbackState.RollingBackAndWillRetryAction;
                            }
                            else
                            {
                                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - observed non-retryable exception.  Exception: {1}", actionState.OperationId, e.ToString());
                                actionState.RollbackState = RollbackState.RollingBackAndWillFailAction;
                            }
                        }

                        // Remember the failure so the rollback block below can translate it into ErrorCausingRollback.
                        actionError = e;
                        break;
                    }
                }
            }

            if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction ||
                actionState.RollbackState == RollbackState.RollingBackAndWillFailAction ||
                (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel && (actionState.StateProgress.Peek() != StepStateNames.CompletedSuccessfully)))
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Rollingback type={1}", actionState.OperationId, actionState.ActionType);
                if (!this.isTestMode && actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
                {
                    string error = string.Format(CultureInfo.InvariantCulture, "{0} - state should not be CompletedSuccessfully", actionState.OperationId);
                    TestabilityTrace.TraceSource.WriteError(TraceType, error);
                    ReleaseAssert.Failfast(error);
                }

                // If actionError is null it means we are currently running a resumed rollback.  In that case the ErrorCausingRollback must have
                // already been set, so it is only (re)computed when this pass observed the failure itself.
                if (actionError != null)
                {
                    actionState.ErrorCausingRollback = TranslateRollbackError(actionError.HResult);
                    TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Translated ErrorCausingRollback ={1}", actionState.OperationId, actionState.ErrorCausingRollback);
                }

                if (this.isTestMode && actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
                {
                    // In test mode it's intentionally possible to fault an action after it's completed its work, but before the state name has been updated.
                    actionState.StateProgress.Pop();
                }

                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);

                try
                {
                    while (actionState.StateProgress.Peek() != StepStateNames.IntentSaved &&
                           actionState.StateProgress.Peek() != StepStateNames.Failed)
                    {
                        cancellationToken.ThrowIfCancellationRequested();
                        RollbackState readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.OuterCleanupLoop).ConfigureAwait(false);

                        if (readRollbackState == RollbackState.RollingBackDueToUserCancel)
                        {
                            // Do nothing, already rolling back - debug only
                            TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Read RollingBackDueToUserCancel in outer rollback loop", actionState.OperationId);
                        }
                        else if (readRollbackState == RollbackState.RollingBackForce)
                        {
                            TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Read RollingBackForce in outer rollback loop", actionState.OperationId);
                            break;
                        }

                        StepStateNames currentStateName = actionState.StateProgress.Peek();
                        TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - DEBUG - Rollback path loop, current state {1}", actionState.OperationId, actionState.StateProgress.Peek());

                        try
                        {
                            await this.CleanupStepAsync(fabricClient, action, actionState, cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);

                            await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
                        }
                        catch (FabricNotPrimaryException)
                        {
                            throw;
                        }
                        catch (FabricObjectClosedException)
                        {
                            throw;
                        }
                        catch (Exception e)
                        {
                            ReleaseAssert.Failfast("Unexpected exception, RunStateAsync for cleanup should have handled {0}", e);
                        }
                    }

                    // If this is true rollback is finished.  If it is retryable set the state to LookingUpState
                    if (actionState.StateProgress.Peek() == StepStateNames.IntentSaved)
                    {
                        if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction)
                        {
                            actionState.StateProgress.Push(StepStateNames.LookingUpState);
                            actionState.ClearInfo();
                        }
                        else if (actionState.RollbackState == RollbackState.RollingBackAndWillFailAction)
                        {
                            actionState.StateProgress.Push(StepStateNames.Failed);

                            actionState.RollbackState = RollbackState.NotRollingBack;
                            actionState.TimeStopped   = DateTime.UtcNow;
                        }
                        else if (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel)
                        {
                            actionState.StateProgress.Push(StepStateNames.Failed);

                            actionState.TimeStopped = DateTime.UtcNow;
                        }
                        else if (actionState.RollbackState == RollbackState.RollingBackForce)
                        {
                            actionState.StateProgress.Push(StepStateNames.Failed);

                            actionState.TimeStopped = DateTime.UtcNow;
                        }
                        else
                        {
                            string error = string.Format(CultureInfo.InvariantCulture, "{0} - RollbackState == NotRollingBack not expected", actionState.OperationId);
                            ReleaseAssert.Failfast(error);
                        }
                    }
                    else if (actionState.RollbackState == RollbackState.RollingBackForce)
                    {
                        actionState.StateProgress.Push(StepStateNames.Failed);
                        actionState.TimeStopped = DateTime.UtcNow;
                    }
                }
                catch (OperationCanceledException)
                {
                    // This means the cancellation token is set, not that an api call observed an E_ABORT
                    throw;
                }
                catch (FabricNotPrimaryException)
                {
                    throw;
                }
                catch (FabricObjectClosedException)
                {
                    throw;
                }
                catch (Exception e)
                {
                    ReleaseAssert.Failfast("Unexpected exception, RunStateAsync for cleanup should have handled {0}", e);
                }

                TestabilityTrace.TraceSource.WriteInfo(
                    TraceType,
                    "{0} - Action failed, type='{1}', will retry={2}, RollbackState={3}",
                    actionState.OperationId,
                    actionState.ActionType,
                    actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction ? "true" : "false",
                    actionState.RollbackState);

                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
            }
            else if (actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
            {
                // user cancelled, but action/command completed anyways before cancellation was checked.
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} - Action type '{1}' completed successfully, not updating again ", actionState.OperationId, actionState.ActionType);
            }
            else if ((actionState.StateProgress.Peek() == StepStateNames.IntentSaved) &&
                     (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel))
            {
                actionState.StateProgress.Push(StepStateNames.Failed);
                actionState.TimeStopped = DateTime.UtcNow;

                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
            }
            else if (actionState.RollbackState == RollbackState.RollingBackForce)
            {
                // Note: unlike the case above this does not have a state of IntentSaved as a requirement since a force rollback is an abort and does not run the steps in reverse.
                // It is possible for the StateProgress to be CompletedSuccessfully here, since we want to exit as quickly as possible.  In that case, the block 2 blocks above handles it -
                // we do nothing extra, and the command finishes executing.  If the user calls an api for information on this command, we translate the state to ForceCancelled if state is a terminal state
                // and RollbackState is RollingBackForce.  See ActionStore.MatchesStateFilter().
                actionState.TimeStopped = DateTime.UtcNow;
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "Bottom of Engine.RunAsync() - state is={0}, rollbackState={1}", actionState.StateProgress.Peek().ToString(), actionState.RollbackState.ToString());
                actionState.StateProgress.Push(StepStateNames.Failed);
                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
            }
            else
            {
                string unexpectedError = string.Format(CultureInfo.InvariantCulture, "Unexpected case reached, state is={0}, rollbackState={1}", actionState.StateProgress.Peek().ToString(), actionState.RollbackState.ToString());
                TestabilityTrace.TraceSource.WriteError(TraceType, "{0}", unexpectedError);
                ReleaseAssert.Failfast(unexpectedError);
            }
        }
        /// <summary>
        /// Second step of the TestRetryStep command: waits briefly to simulate work, invokes the internal
        /// service fault hook, then marks the command as <c>CompletedSuccessfully</c>.
        /// </summary>
        /// <param name="cancellationToken">Token handed to the internal fault hook; the simulated delay does not observe it.</param>
        /// <param name="serviceInternalFaultInfo">Internal fault info handed to the fault hook.</param>
        /// <returns>The step's own state after <c>CompletedSuccessfully</c> has been pushed.</returns>
        public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            TestabilityTrace.TraceSource.WriteInfo(
                StepBase.TraceType,
                "{0} - Inside TestRetryStepStepTwo - this should retry w/o rollback when exception is thrown",
                this.State.OperationId);

            // The converted state is not used afterwards; the call is kept because Convert may validate the state type.
            TestRetryStepState retryStepState = Convert(this.State);

            // Simulate work
            TimeSpan simulatedWork = TimeSpan.FromMilliseconds(500);
            await Task.Delay(simulatedWork).ConfigureAwait(false);

            TestabilityTrace.TraceSource.WriteInfo(
                StepBase.TraceType,
                "{0} - Inside TestRetryStepStepTwo - before PerformInternalServiceFaultIfRequested",
                this.State.OperationId);

            ActionTest.PerformInternalServiceFaultIfRequested(
                this.State.OperationId,
                serviceInternalFaultInfo,
                this.State,
                cancellationToken,
                true);

            TestabilityTrace.TraceSource.WriteInfo(
                StepBase.TraceType,
                "{0} - Inside TestRetryStepStepTwo - after PerformInternalServiceFaultIfRequested",
                this.State.OperationId);

            this.State.StateProgress.Push(StepStateNames.CompletedSuccessfully);

            return this.State;
        }
Example #19
0
        /// <summary>
        /// Engine entry point: repeatedly runs the state machine for <paramref name="actionState"/> until the
        /// top of its <c>StateProgress</c> is <c>CompletedSuccessfully</c> or <c>Failed</c>.
        /// </summary>
        /// <remarks>
        /// A force rollback observed at the top of an iteration pushes <c>Failed</c>, persists the state and
        /// stops. After a pass that ends in <c>RollingBackAndWillRetryAction</c>, the rollback state is
        /// cleared and persisted, and the loop pauses for a random 10-59 seconds before the next pass.
        /// <see cref="FabricNotPrimaryException"/> and <see cref="FabricObjectClosedException"/> are traced and
        /// swallowed; every other exception is traced as a warning and rethrown.
        /// </remarks>
        /// <param name="fc">Client passed through to the state machine.</param>
        /// <param name="action">Action passed through to the state machine.</param>
        /// <param name="actionState">State of the command being driven.</param>
        /// <param name="serviceInternalFaultInfo">Internal fault info passed through to the state machine.</param>
        /// <param name="cancellationToken">Token checked at the top of every iteration and during the retry pause.</param>
        public async Task RunAsync(FabricClient fc, FabricTestAction action, ActionStateBase actionState, ServiceInternalFaultInfo serviceInternalFaultInfo, CancellationToken cancellationToken)
        {
            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Inside RunAsync of Engine, entering state machine", actionState.OperationId);
            try
            {
                while (true)
                {
                    cancellationToken.ThrowIfCancellationRequested();
                    RollbackState observedRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.OuterLoop).ConfigureAwait(false);

                    // For the non-force case we need to cleanup, so only the force case stops right here.
                    if (observedRollbackState == RollbackState.RollingBackForce)
                    {
                        actionState.StateProgress.Push(StepStateNames.Failed);
                        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);

                        break;
                    }

                    await this.RunStateMachineAsync(fc, action, actionState, serviceInternalFaultInfo, cancellationToken).ConfigureAwait(false);

                    if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction)
                    {
                        actionState.ErrorCausingRollback = 0;
                        int pauseTime = this.random.Next(10, 60);
                        TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Pausing for {1} seconds before retrying", actionState.OperationId, pauseTime);

                        // Clear the rollback state so it will go forward when it resumes.
                        actionState.RollbackState = RollbackState.NotRollingBack;
                        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);

                        await Task.Delay(TimeSpan.FromSeconds(pauseTime), cancellationToken).ConfigureAwait(false);
                    }

                    // Stop once the command has reached a terminal step.
                    StepStateNames latestStep = actionState.StateProgress.Peek();
                    if (latestStep == StepStateNames.CompletedSuccessfully || latestStep == StepStateNames.Failed)
                    {
                        break;
                    }
                }
            }
            catch (FabricNotPrimaryException notPrimary)
            {
                FaultAnalysisServiceUtility.TraceFabricNotPrimary(actionState.OperationId, notPrimary);
            }
            catch (FabricObjectClosedException objectClosed)
            {
                FaultAnalysisServiceUtility.TraceFabricObjectClosed(actionState.OperationId, objectClosed);
            }
            catch (Exception unexpected)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} caught exception - {1}", actionState.OperationId, unexpected);
                throw;
            }

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Exiting state machine", actionState.OperationId);
        }
Example #20
0
            /// <summary>
            /// Data loss step: applies an unreliable transport behavior on the node recorded in the command
            /// info, removes the selected replicas of the target partition, waits for them to drop, removes
            /// the transport behavior again, and then polls the partition until its data loss number changes.
            /// </summary>
            /// <param name="cancellationToken">Token passed to every fabric call and delay in this step.</param>
            /// <param name="serviceInternalFaultInfo">Internal fault info handed to the test fault hook.</param>
            /// <returns>The converted command state after <c>CompletedSuccessfully</c> has been pushed.</returns>
            /// <exception cref="FabricTransientException">
            /// Thrown when no target replicas could be selected, or when the data loss number did not change
            /// within the polling window.
            /// </exception>
            /// <exception cref="FabricException">Thrown when the target partition is missing from the partition query result.</exception>
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeDataLossState dataLossState = Convert(this.State);

                // Values recorded in the command info that this step works from.
                PartitionSelector selector = dataLossState.Info.PartitionSelector;
                DataLossMode dataLossMode = dataLossState.Info.DataLossMode;
                long dataLossNumberBefore = dataLossState.Info.DataLossNumber;
                string fmPrimaryNodeName = dataLossState.Info.NodeName;
                Guid partitionId = dataLossState.Info.PartitionId;
                string behaviorName = dataLossState.Info.UnreliableTransportInfo.First().Item2;
                int targetReplicaSetSize = dataLossState.Info.TargetReplicaSetSize;

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying UT, partitionId={1}", this.State.OperationId, partitionId);
                System.Fabric.Common.UnreliableTransportBehavior transportBehavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
                transportBehavior.AddFilterForPartitionId(partitionId);

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        fmPrimaryNodeName,
                        behaviorName,
                        transportBehavior,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                ServiceReplicaList queriedReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // Every replica is asserted to be stateful before it is considered as a target.
                List<StatefulServiceReplica> statefulReplicas = new List<StatefulServiceReplica>();

                foreach (var queriedReplica in queriedReplicas)
                {
                    StatefulServiceReplica statefulReplica = queriedReplica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
                    statefulReplicas.Add(statefulReplica);
                }

                // Select target replicas based on the DataLossMode
                List<StatefulServiceReplica> targets;

                switch (dataLossMode)
                {
                    case DataLossMode.FullDataLoss:
                        targets = GetReplicasForFullDataLoss(statefulReplicas);
                        break;

                    case DataLossMode.PartialDataLoss:
                        targets = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(dataLossState.OperationId, statefulReplicas);
                        break;

                    default:
                        throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedDataLossMode);
                }

                if (targets == null)
                {
                    // This will cause the command to rollback and retry
                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                foreach (var replica in targets)
                {
                    TestabilityTrace.TraceSource.WriteInfo(
                        StepBase.TraceType,
                        "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
                        this.State.OperationId,
                        replica.Id,
                        partitionId,
                        replica.ReplicaRole,
                        replica.ReplicaStatus);

                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                            replica.NodeName,
                            partitionId,
                            replica.Id,
                            this.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.RemoveReplicaErrors.Value,
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);
                }

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, targets, cancellationToken).ConfigureAwait(false);

                await RemoveUnreliableTransportAsync(this.FabricClient, fmPrimaryNodeName, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

                // NOTE(review): the polling window is a fixed 30 seconds, while the failure message below reports
                // dataLossCheckWaitDurationInSeconds - confirm which value is intended.
                bool dataLossObserved = false;
                TimeoutHelper pollWindow = new TimeoutHelper(TimeSpan.FromSeconds(30));

                do
                {
                    ServicePartitionList queriedPartitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.QueryManager.GetPartitionListAsync(
                            this.partitionSelector.ServiceName,
                            null,
                            this.RequestTimeout,
                            cancellationToken),
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);

                    // Locate the target partition in the query result and read its current data loss number.
                    bool partitionFound = false;
                    long dataLossNumberAfter = 0;
                    foreach (StatefulServicePartition candidate in queriedPartitions)
                    {
                        if (candidate.PartitionInformation.Id != partitionId)
                        {
                            continue;
                        }

                        dataLossNumberAfter = candidate.PrimaryEpoch.DataLossNumber;
                        partitionFound = true;
                        break;
                    }

                    if (!partitionFound)
                    {
                        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                    }

                    TestabilityTrace.TraceSource.WriteInfo(
                        StepBase.TraceType,
                        "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
                        this.State.OperationId,
                        partitionId,
                        pollWindow.GetRemainingTime(),
                        dataLossNumberBefore,
                        dataLossNumberAfter);

                    // A changed data loss number is the signal that data loss was induced.
                    if (dataLossNumberAfter != dataLossNumberBefore)
                    {
                        dataLossObserved = true;
                        break;
                    }

                    await System.Fabric.Common.AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds), cancellationToken).ConfigureAwait(false);
                }
                while (pollWindow.GetRemainingTime() > TimeSpan.Zero);

                if (dataLossObserved)
                {
                    dataLossState.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                    return dataLossState;
                }

                // This is only viewable internally for debug.  This will cause a retry of the whole flow.
                string error = string.Format(
                    CultureInfo.InvariantCulture,
                    "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
                    this.State.OperationId,
                    selector.ServiceName,
                    partitionId,
                    this.dataLossCheckWaitDurationInSeconds);
                TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);
                throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
            }
Example #21
0
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                NodeCommandState state = Convert(this.State);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StopNode.LookingUpState performing node query", this.State.OperationId);

                Node queriedNode = await FaultAnalysisServiceUtility.GetNodeInfoAsync(
                    this.State.OperationId,
                    this.FabricClient,
                    state.Info.NodeName,
                    this.action.Partition,
                    this.action.StateManager,
                    this.action.StoppedNodeTable,
                    this.RequestTimeout,
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StopNode.LookingUpState node query completed", this.State.OperationId);

                // Check for bad state
                if (queriedNode == null ||
                    queriedNode.NodeStatus == NodeStatus.Invalid ||
                    queriedNode.NodeStatus == NodeStatus.Unknown ||
                    queriedNode.NodeStatus == NodeStatus.Removed)
                {
                    // Fail the command
                    Exception nodeNotFoundException = FaultAnalysisServiceUtility.CreateException(
                        TraceType,
                        Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_NOT_FOUND,
                        string.Format(CultureInfo.InvariantCulture, "{0} - Node {1} does not exist", this.State.OperationId, state.Info.NodeName));
                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - throwing fatal exception {1}", this.State.OperationId, nodeNotFoundException);
                    throw new FatalException("fatal", nodeNotFoundException);
                }

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StopNode LookingUpState reading RD", this.State.OperationId);
                bool isStopped = await FaultAnalysisServiceUtility.ReadStoppedNodeStateAsync(
                    this.State.OperationId,
                    this.action.Partition,
                    this.action.StateManager,
                    this.action.StoppedNodeTable,
                    state.Info.NodeName,
                    cancellationToken).ConfigureAwait(false);

                if (queriedNode.NodeStatus == NodeStatus.Down && isStopped)
                {
                    // Node already stopped
                    Exception nodeAlreadyStopped = FaultAnalysisServiceUtility.CreateException(
                        TraceType,
                        Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_ALREADY_STOPPED,
                        string.Format(CultureInfo.InvariantCulture, "Node {0} is already stopped", state.Info.NodeName));
                    throw new FatalException("fatal", nodeAlreadyStopped);
                }
                else if (queriedNode.NodeStatus != NodeStatus.Down && isStopped)
                {
                    // FM says the node is up, so FAS has incorrect state, perhaps because of an out of band start from the original deprecated api.
                    // Correct the state, then continue to run this command normally.  It is valid.
                    await FaultAnalysisServiceUtility.SetStoppedNodeStateAsync(
                        this.action.State.OperationId,
                        this.action.Partition,
                        this.action.StateManager,
                        this.action.StoppedNodeTable,
                        queriedNode.NodeName,
                        false,
                        cancellationToken).ConfigureAwait(false);
                }
                else if (queriedNode.NodeStatus == NodeStatus.Down && !isStopped)
                {
                    // Node is down (as opposed to stopped)
                    Exception nodeIsDown = FaultAnalysisServiceUtility.CreateException(
                        TraceType,
                        Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_DOWN,
                        string.Format(CultureInfo.InvariantCulture, "Node {0} is down", state.Info.NodeName));
                    throw new FatalException("fatal", nodeIsDown);
                }

                state.Info.InitialQueriedNodeStatus       = queriedNode.NodeStatus;
                state.Info.NodeWasInitiallyInStoppedState = isStopped;
                TestabilityTrace.TraceSource.WriteInfo(
                    StepBase.TraceType,
                    "{0} - StopNode LookingUpState InitialQueriedNodeStatus='{1}', NodeWasInitiallyInStoppedState='{2}'",
                    this.State.OperationId,
                    state.Info.InitialQueriedNodeStatus,
                    state.Info.NodeWasInitiallyInStoppedState);

                state.StateProgress.Push(StepStateNames.PerformingActions);
                return(state);
            }
Example #22
0
            // Looking-up step for the data-loss command. In order, it:
            //   1. fetches the target service's description and rejects non-stateful services,
            //   2. resolves the selected partition and reads its current primary-epoch data loss number,
            //   3. finds the node hosting the ready primary replica of the Failover Manager partition,
            //   4. stores all of that on the command state and advances it to PerformingActions.
            // Throws FabricInvalidForStatelessServicesException for a non-stateful service, and FabricException
            // when the selected partition is not in the partition list or no ready FM primary is found.
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeDataLossState dataLossState = Convert(this.State);

                ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        this.partitionSelector.ServiceName,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                if (serviceDescription.Kind != ServiceDescriptionKind.Stateful)
                {
                    // The first argument is a debugging-only message; it is not returned to the user.
                    throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
                }

                StatefulServiceDescription statefulDescription = serviceDescription as StatefulServiceDescription;
                int replicaSetSizeTarget = statefulDescription.TargetReplicaSetSize;

                SelectedPartition selectedPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
                    this.FabricClient,
                    this.partitionSelector,
                    this.RequestTimeout,
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                Guid selectedPartitionId = selectedPartition.PartitionId;

                ServicePartitionList servicePartitions = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetPartitionListAsync(
                        this.partitionSelector.ServiceName,
                        null,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // Stays null until the selected partition is located in the list.
                long? dataLossNumberBeforeFault = null;

                foreach (StatefulServicePartition candidate in servicePartitions)
                {
                    if (candidate.PartitionInformation.Id == selectedPartitionId)
                    {
                        dataLossNumberBeforeFault = candidate.PrimaryEpoch.DataLossNumber;
                        break;
                    }
                }

                if (!dataLossNumberBeforeFault.HasValue)
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                }

                ServiceReplicaList fmReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        FASConstants.FmPartitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                string fmPrimaryNodeName = string.Empty;

                foreach (var fmReplica in fmReplicas)
                {
                    // Only replicas reporting Ready are considered.
                    if (fmReplica.ReplicaStatus != ServiceReplicaStatus.Ready)
                    {
                        continue;
                    }

                    StatefulServiceReplica statefulFmReplica = fmReplica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");

                    // No early exit: if several ready replicas report Primary, the last one wins.
                    if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
                    {
                        fmPrimaryNodeName = fmReplica.NodeName;
                    }
                }

                if (string.IsNullOrEmpty(fmPrimaryNodeName))
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
                }

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary location={1}", this.State.OperationId, fmPrimaryNodeName);

                // Pair the FM primary's node with a behavior name that is unique to this operation.
                string transportBehaviorName = "BlockDoReconfiguration_" + this.State.OperationId;
                List <Tuple <string, string> > transportBehaviors = new List <Tuple <string, string> >
                {
                    new Tuple <string, string>(fmPrimaryNodeName, transportBehaviorName)
                };

                dataLossState.StateProgress.Push(StepStateNames.PerformingActions);
                dataLossState.Info.DataLossNumber          = dataLossNumberBeforeFault.Value;
                dataLossState.Info.NodeName                = fmPrimaryNodeName;
                dataLossState.Info.PartitionId             = selectedPartitionId;
                dataLossState.Info.UnreliableTransportInfo = transportBehaviors;
                dataLossState.Info.TargetReplicaSetSize    = replicaSetSizeTarget;
                return dataLossState;
            }
Example #23
0
 // Runs this step asynchronously and produces the resulting action state.
 // The overrides visible in this file push the next StepStateNames value onto the state's
 // StateProgress and return that same state object.
 //   cancellationToken        - token the step may observe to stop early.
 //   serviceInternalFaultInfo - NOTE(review): presumably internal fault-injection settings for tests;
 //                              the overrides visible here do not read it - confirm against callers.
 public abstract Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo);
        // First step of the retry-step command (traced as "TestRetryStepStepOne"): logs entry, waits
        // 500 ms to simulate work, then advances the state to PerformingActions and returns it.
        //   cancellationToken        - observed during the simulated wait; if it is signalled the delay
        //                              ends with a TaskCanceledException instead of running to completion.
        //   serviceInternalFaultInfo - not used by this step.
        public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - Inside TestRetryStepStepOne", this.State.OperationId);

            // Simulate work. The token is passed so a cancel request interrupts the wait rather than
            // being ignored until the delay elapses.
            await Task.Delay(TimeSpan.FromMilliseconds(500), cancellationToken).ConfigureAwait(false);

            this.State.StateProgress.Push(StepStateNames.PerformingActions);
            return this.State;
        }
Example #25
0
            // Looking-up (precondition) step for the start-node command.
            // Queries the node, reads the recorded "stopped" flag for it, and then:
            //   - node running, not recorded as stopped -> fatal FABRIC_E_NODE_IS_UP;
            //   - node running, recorded as stopped     -> clears the recorded flag, then fatal FABRIC_E_NODE_IS_UP;
            //   - node Down, not recorded as stopped    -> fatal FABRIC_E_NODE_IS_DOWN;
            //   - otherwise                             -> records the initial status and stopped flag on the
            //                                              state and advances it to PerformingActions.
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                NodeCommandState commandState = Convert(this.State);

                Node nodeInfo = await FaultAnalysisServiceUtility.GetNodeInfoAsync(
                    this.State.OperationId,
                    this.FabricClient,
                    commandState.Info.NodeName,
                    this.action.Partition,
                    this.action.StateManager,
                    this.action.StoppedNodeTable,
                    this.RequestTimeout,
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StartNode LookingUpState reading RD", this.State.OperationId);
                bool recordedAsStopped = await FaultAnalysisServiceUtility.ReadStoppedNodeStateAsync(
                    this.State.OperationId,
                    this.action.Partition,
                    this.action.StateManager,
                    this.action.StoppedNodeTable,
                    commandState.Info.NodeName,
                    cancellationToken).ConfigureAwait(false);

                bool nodeIsRunning = FaultAnalysisServiceUtility.IsNodeRunning(nodeInfo);

                if (nodeIsRunning && !recordedAsStopped)
                {
                    // Calling StartNodeUsingNodeNameAsync() here without checking first would yield either an
                    // instance mismatch or "node has not stopped yet". Because this is a precondition check it
                    // does not look at the instance id, unlike the PerformingActions step, which runs after the
                    // call to StartNodeUsingNodeNameAsync() and does.
                    Exception alreadyStarted = FaultAnalysisServiceUtility.CreateException(
                        TraceType,
                        NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_UP,
                        string.Format(CultureInfo.InvariantCulture, "Node {0} already started", commandState.Info.NodeName),
                        FabricErrorCode.NodeIsUp);

                    throw new FatalException("fatal", alreadyStarted);
                }

                if (nodeIsRunning)
                {
                    // Running but recorded as stopped: this can only come from an out-of-band start.
                    // Correct the recorded state first, then fail the command with "node is up".
                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StartNode LookingUpState setting RD entry for node {1} to not stopped", this.State.OperationId, commandState.Info.NodeName);
                    await FaultAnalysisServiceUtility.SetStoppedNodeStateAsync(
                        this.action.State.OperationId,
                        this.action.Partition,
                        this.action.StateManager,
                        this.action.StoppedNodeTable,
                        nodeInfo.NodeName,
                        false,
                        cancellationToken).ConfigureAwait(false);

                    Exception runningAfterCorrection = FaultAnalysisServiceUtility.CreateException(
                        TraceType,
                        Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_UP,
                        string.Format(CultureInfo.InvariantCulture, "Node {0} is up", commandState.Info.NodeName));
                    throw new FatalException("fatal", runningAfterCorrection);
                }

                if (nodeInfo.NodeStatus == NodeStatus.Down && !recordedAsStopped)
                {
                    // Special scenario that can arise when:
                    // 1)  the node was stopped out of band through the old api, or
                    // 2)  the node went down (down, not stopped).
                    // This is deliberately not handled; report node down.
                    Exception downNotStopped = FaultAnalysisServiceUtility.CreateException(
                        TraceType,
                        Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_DOWN,
                        string.Format(CultureInfo.InvariantCulture, "Node {0} is down", commandState.Info.NodeName));
                    throw new FatalException("fatal", downNotStopped);
                }

                // Preconditions hold: remember what was observed and move on to the action step.
                commandState.Info.InitialQueriedNodeStatus       = nodeInfo.NodeStatus;
                commandState.Info.NodeWasInitiallyInStoppedState = recordedAsStopped;

                commandState.StateProgress.Push(StepStateNames.PerformingActions);

                return commandState;
            }
Example #26
0
        public Task InsertCommandAsync(Command command)
        {
            Uri failoverManagerUri = new Uri("fabric:/System/FailoverManagerService");

            TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "****Adding command: " + command);
            Task task = null;

            if (command == Command.FailoverManagerDataLoss)
            {
                PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri);
                Guid id = MockClientCommandInfo[Command.FailoverManagerDataLoss];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, null);

                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.InvokeDataLossMidActionTestFatal)
            {
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction);
                Guid id = MockClientCommandInfo[Command.InvokeDataLossMidActionTestFatal];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed);
            }
            else if (command == Command.InvokeDataLossMidActionTestTransient)
            {
                // rollback and retry then success
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackActionAndRetry);
                Guid id = MockClientCommandInfo[Command.InvokeDataLossMidActionTestTransient];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.InvokeDataLossMidActionTestFailover)
            {
                // failover then success
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.KillProcess);
                Guid id = MockClientCommandInfo[Command.InvokeDataLossMidActionTestFailover];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.FailoverManagerDataLossCauseActionRollbackFatal)
            {
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.CompletedSuccessfully, ServiceInternalFaultType.RollbackAction);
                Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackFatal];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed);
            }
            else if (command == Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetry)
            {
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.CompletedSuccessfully, ServiceInternalFaultType.RollbackActionAndRetry);
                Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetry];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.FailoverManagerDataLossFailoverFaultAnalysisService)
            {
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.PerformingActions, ServiceInternalFaultType.KillProcess);
                Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossFailoverFaultAnalysisService];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.FailoverManagerDataLossCauseActionRollbackFatalBeforeActionStep)
            {
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction);
                Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackFatalBeforeActionStep];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed);
            }
            else if (command == Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetryBeforeActionStep)
            {
                PartitionSelector        ps        = PartitionSelector.SingletonOf(failoverManagerUri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackActionAndRetry);
                Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetryBeforeActionStep];
                task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.InvokeQuorumLossMidActionTestFatal)
            {
                Uri uri = new Uri("fabric:/System/NamingService");
                PartitionSelector        ps        = PartitionSelector.RandomOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction);
                Guid id = MockClientCommandInfo[Command.InvokeQuorumLossMidActionTestFatal];
                task = this.messageProcessor.ProcessQuorumLossCommandAsync(id, ps, QuorumLossMode.AllReplicas, TimeSpan.FromSeconds(10.0d), FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed);
            }
            else if (command == Command.InvokeQuorumLossMidActionTestFailover)
            {
                Uri uri = new Uri("fabric:/System/NamingService");
                PartitionSelector        ps        = PartitionSelector.RandomOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.KillProcess);
                Guid id = MockClientCommandInfo[Command.InvokeQuorumLossMidActionTestFailover];
                task = this.messageProcessor.ProcessQuorumLossCommandAsync(id, ps, QuorumLossMode.AllReplicas, TimeSpan.FromSeconds(10.0d), FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.InvokeQuorumLossMidActionTestTransient)
            {
                Uri uri = new Uri("fabric:/System/NamingService");
                PartitionSelector        ps        = PartitionSelector.RandomOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackActionAndRetry);
                Guid id = MockClientCommandInfo[Command.InvokeQuorumLossMidActionTestTransient];
                task = this.messageProcessor.ProcessQuorumLossCommandAsync(id, ps, QuorumLossMode.AllReplicas, TimeSpan.FromSeconds(10.0d), FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.RestartPartitionMidActionTestFatal)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction);
                Guid id = MockClientCommandInfo[Command.RestartPartitionMidActionTestFatal];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed);
            }
            else if (command == Command.RestartPartitionMidActionTestFailover)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.KillProcess);
                Guid id = MockClientCommandInfo[Command.RestartPartitionMidActionTestFailover];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.RestartPartitionMidActionTestTransient)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackActionAndRetry);
                Guid id = MockClientCommandInfo[Command.RestartPartitionMidActionTestTransient];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully);
            }
            else if (command == Command.StuckAction)
            {
                Guid id = MockClientCommandInfo[Command.StuckAction];
                task = this.messageProcessor.ProcessStuckCommandAsync(id, null);
            }
            else if (command == Command.RestartPartitionCancelOuterLoopNoForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.None, // for this case this value does not matter
                    ServiceInternalFaultType.None,
                    RollbackState.RollingBackDueToUserCancel,
                    FASConstants.OuterLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterLoopNoForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel);
            }
            else if (command == Command.RestartPartitionCancelForwardNoForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.None,
                    ServiceInternalFaultType.None,
                    RollbackState.RollingBackDueToUserCancel,
                    FASConstants.ForwardLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardNoForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel);
            }
            else if (command == Command.RestartPartitionCancelForwardExceptionNoForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.MidPerformingActions,
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackDueToUserCancel,
                    FASConstants.ForwardLoopExceptionBlock);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardExceptionNoForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel);
            }
            else if (command == Command.RestartPartitionCancelOuterCleanupNoForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.PerformingActions,
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackDueToUserCancel,
                    FASConstants.OuterCleanupLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterCleanupNoForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel);
            }
            else if (command == Command.RestartPartitionCancelCleanupInnerNoForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.PerformingActions,
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackDueToUserCancel,
                    FASConstants.InnerCleanupLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelCleanupInnerNoForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel);
            }
            else if (command == Command.RestartPartitionCancelOuterLoopForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.PerformingActions, // for this case, this value does not matter
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackForce,
                    FASConstants.ForwardLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterLoopForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce);
            }
            else if (command == Command.RestartPartitionCancelForwardForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.PerformingActions,
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackForce,
                    FASConstants.ForwardLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce);
            }
            else if (command == Command.RestartPartitionCancelForwardExceptionForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.PerformingActions,
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackForce,
                    FASConstants.ForwardLoopExceptionBlock);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardExceptionForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce);
            }
            else if (command == Command.RestartPartitionCancelOuterCleanupForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.PerformingActions,
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackForce,
                    FASConstants.OuterCleanupLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterCleanupForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce);
            }
            else if (command == Command.RestartPartitionCancelCleanupInnerForce)
            {
                Uri uri = new Uri("fabric:/System/ClusterManagerService");
                PartitionSelector        ps        = PartitionSelector.SingletonOf(uri);
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.PerformingActions,
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackForce,
                    FASConstants.InnerCleanupLoop);
                Guid id = MockClientCommandInfo[Command.RestartPartitionCancelCleanupInnerForce];
                task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce);
            }
            else if (command == Command.TestRetryStepWithSuccessAfterRetries)
            {
                // Intentionally fail the step corresponding to StepStateNames.PerformingActions step a few times, then run it normally (pass).  It should succeed.
                Guid id = MockClientCommandInfo[Command.TestRetryStepWithSuccessAfterRetries];
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.MidPerformingActions,
                    ServiceInternalFaultType.ThrowThreeTimes);

                task = this.messageProcessor.ProcessRetryStepCommandAsync(id, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully, RollbackState.NotRollingBack);
            }
            else if (command == Command.TestRetryStepWithForceCancel)
            {
                // Force cancel a command with ActionStateBase.RetryStepWithoutRollingBackOnFailure set to true
                Guid id = MockClientCommandInfo[Command.TestRetryStepWithForceCancel];
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.CompletedSuccessfully, // this just has to be a late step so an earlier fault is not used before we reach the situation we want.
                    ServiceInternalFaultType.RollbackAction,
                    RollbackState.RollingBackForce,       // note, the graceful one should not cause cancellation since for this type we only allow user cancellation when force is true
                    FASConstants.InnerForwardLoop,
                    StepStateNames.PerformingActions);
                task = this.messageProcessor.ProcessRetryStepCommandAsync(id, faultInfo);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce);
            }
            else if (command == Command.StopNodeWithUnknownException)
            {
                Guid id = MockClientCommandInfo[Command.StopNodeWithUnknownException];
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.MidPerformingActions,
                    ServiceInternalFaultType.RollbackAction);  // In this case, since start and stop node do not rollback like other commands, this exception should cause the step to retry.

                Node target = ActionTest.GetNodeWithFASSecondary().Result;
                TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} stopping {1}:{2}", id, target.NodeName, target.NodeInstanceId);
                task = this.messageProcessor.ProcessStopNodeCommandAsync(id, target.NodeName, target.NodeInstanceId, 999, FASConstants.DefaultTestTimeout, faultInfo);

                // Let the command make progress
                Task.Delay(TimeSpan.FromSeconds(30)).Wait();
                this.WaitForState(id, StepStateNames.PerformingActions, RollbackState.NotRollingBack);

                // This should not result in cancellation, since start and stop node have different rollback policies than the other commands.
                TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} - cancelling with force==false.  This should not cancel the command", id);
                this.messageProcessor.CancelTestCommandAsync(id, false);

                this.WaitForState(id, StepStateNames.PerformingActions, RollbackState.RollingBackDueToUserCancel);

                // Now force cancel.  This should cancel.
                TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} - cancelling with force==true.  This should cancel the command", id);
                this.messageProcessor.CancelTestCommandAsync(id, true);
                this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce);

                NodeList nodes = ActionTest.GetNodeListAsync().Result;
                TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} - node info:", id);
                foreach (Node n in nodes)
                {
                    TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "    OperationId:{0} - NodeName{1}, NodeStatus:{2}, IsStopped:{3}", id, n.NodeName, n.NodeStatus, n.IsStopped);
                }

                Node targetNodeAfterTest = nodes.Where(n => n.NodeName == target.NodeName).FirstOrDefault();
                if (targetNodeAfterTest == null)
                {
                    throw new InvalidOperationException("target node was not found in query after test");
                }

                if (targetNodeAfterTest.IsStopped == false)
                {
                    throw new InvalidOperationException("target node should have IsStopped true, was false");
                }
            }
            else if (command == Command.StopNodeWithExceptionAndSuccessAfterRetries)
            {
                Guid id = MockClientCommandInfo[Command.StopNodeWithExceptionAndSuccessAfterRetries];

                // Inject a fault during the operation so that step "StepStateNames.MidPerformingActions" has to retry 3 times before succeeding
                ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(
                    StepStateNames.MidPerformingActions,
                    ServiceInternalFaultType.ThrowThreeTimes);

                Node target = ActionTest.GetNodeWithFASSecondary().Result;
                TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} stopping {1}:{2}", id, target.NodeName, target.NodeInstanceId);
                task = this.messageProcessor.ProcessStopNodeCommandAsync(id, target.NodeName, target.NodeInstanceId, 999, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully, RollbackState.NotRollingBack);

                // Start the stopped node
                task = this.messageProcessor.ProcessStartNodeCommandAsync(Guid.NewGuid(), target.NodeName, target.NodeInstanceId, FASConstants.DefaultTestTimeout, faultInfo);
                this.WaitForState(id, StepStateNames.CompletedSuccessfully, RollbackState.NotRollingBack);
            }
            else
            {
                ReleaseAssert.Failfast("Unexpected command");
            }

            return(task);
        }
            /// <summary>
            /// Runs the quorum-loss step: applies an unreliable-transport behavior on each recorded node,
            /// restarts the recorded replicas, waits for the configured quorum-loss duration, verifies the
            /// partition reports <c>InQuorumLoss</c>, and finally removes the transport behaviors.
            /// </summary>
            /// <param name="cancellationToken">Token forwarded to every fabric call and delay in this step.</param>
            /// <param name="serviceInternalFaultInfo">Fault description forwarded to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
            /// <returns>The step state, with <c>StepStateNames.CompletedSuccessfully</c> pushed onto its progress stack.</returns>
            /// <exception cref="FabricTransientException">
            /// Thrown with <c>FabricErrorCode.NotReady</c> when the partition is never observed in quorum loss
            /// within the allowed number of checks.
            /// </exception>
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeQuorumLossState state = Convert(this.State);

                Guid partitionId = state.Info.PartitionId;
                List <Tuple <string, string> > unreliableTransportInfo = state.Info.UnreliableTransportInfo;
                List <long> targetReplicas = state.Info.ReplicaIds;

                var unreliableTransportTaskList = new List <Task>();

                // Each tuple is (node name, behavior name); the same behavior definition is registered
                // on every node, filtered down to the target partition.
                foreach (Tuple <string, string> ut in unreliableTransportInfo)
                {
                    string nodeName     = ut.Item1;
                    string behaviorName = ut.Item2;

                    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "StatefulServiceReopen");
                    behavior.AddFilterForPartitionId(partitionId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying '{1}'", this.State.OperationId, behaviorName);

                    unreliableTransportTaskList.Add(FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                                                        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                                                            nodeName,
                                                            behaviorName,
                                                            behavior,
                                                            this.RequestTimeout,
                                                            cancellationToken),
                                                        this.OperationTimeout,
                                                        cancellationToken));
                }

                await Task.WhenAll(unreliableTransportTaskList).ConfigureAwait(false);

                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                List <Task> tasks = new List <Task>();

                // Restart all target replicas concurrently, without waiting for the restarts to be verified.
                foreach (long replicaId in targetReplicas)
                {
                    ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(this.partitionSelector.ServiceName, partitionId), replicaId);

                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - faulting replica with id={1}", this.State.OperationId, replicaId);
                    Task task = FaultAnalysisServiceUtility.RestartReplicaAsync(this.FabricClient, replicaSelector, CompletionMode.DoNotVerify, this.RequestTimeout, this.OperationTimeout, cancellationToken);
                    tasks.Add(task);
                }

                await Task.WhenAll(tasks).ConfigureAwait(false);

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - keeping partition in quorum loss for '{1}'", this.State.OperationId, state.Info.QuorumLossDuration);
                await Task.Delay(state.Info.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

                bool conditionSatisfied = false;

                int quorumLossCheckRetries = FASConstants.QuorumLossCheckRetryCount;

                // Poll the partition status. The body runs once and then once more per remaining retry.
                do
                {
                    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - checking PartitionStatus", this.State.OperationId);
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.QueryManager.GetPartitionListAsync(
                            this.partitionSelector.ServiceName,
                            null,
                            this.RequestTimeout,
                            cancellationToken),
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);

                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            if (partition.PartitionStatus == ServicePartitionStatus.InQuorumLoss)
                            {
                                conditionSatisfied = true;
                                break;
                            }
                        }
                    }

                    // The break above only leaves the foreach, so this wait also runs after a successful check.
                    await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
                }while (!conditionSatisfied && quorumLossCheckRetries-- > 0);

                if (!conditionSatisfied)
                {
                    string error = string.Format(CultureInfo.InvariantCulture, "{0} - Service could not induce quorum loss for service '{1}', partition '{2}'. Please retry", this.State.OperationId, this.partitionSelector.ServiceName, partitionId);
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);

                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                // Remove the unreliable-transport behaviors; ConfigureAwait(false) matches every other await in this method.
                await QuorumLossStepsFactory.RemoveUTAsync(this.FabricClient, this.State, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                return(state);
            }
Example #28
0
 /// <summary>
 /// Creates the state object for a node command, forwarding the operation id, action type and
 /// fault info to the base state and recording the node-specific details.
 /// </summary>
 /// <param name="actionType">Action type passed through to the base state.</param>
 /// <param name="operationId">Operation id passed through to the base state.</param>
 /// <param name="nodeSync">Synchronizer stored on <c>NodeSync</c>.</param>
 /// <param name="serviceInternalFaultInfo">Fault info passed through to the base state.</param>
 /// <param name="nodeName">Node name stored in the command info.</param>
 /// <param name="nodeInstanceId">Node instance id stored in the command info.</param>
 /// <param name="stopDurationInSeconds">Stop duration, in seconds, stored in the command info.</param>
 public NodeCommandState(ActionType actionType, Guid operationId, NodeCommandSynchronizer nodeSync, ServiceInternalFaultInfo serviceInternalFaultInfo, string nodeName, BigInteger nodeInstanceId, int stopDurationInSeconds)
     : base(operationId, actionType, serviceInternalFaultInfo)
 {
     // Bundle the node-specific arguments into the info record carried by this state.
     NodeCommandInfo commandInfo = new NodeCommandInfo(nodeName, nodeInstanceId, stopDurationInSeconds);

     this.Info = commandInfo;
     this.NodeSync = nodeSync;

     // Always set for node commands.
     // NOTE(review): presumably makes a failed step retry instead of rolling back - confirm against the flag's consumers.
     this.RetryStepWithoutRollingBackOnFailure = true;
 }