Exemplo n.º 1
0
        public static StepBase GetStep(
            StepStateNames stateName,
            FabricClient fabricClient,
            ActionStateBase actionState,
            InvokeDataLossAction action,
            TimeSpan requestTimeout,
            TimeSpan operationTimeout,
            CancellationToken cancellationToken)
        {
            StepBase            step = null;
            InvokeDataLossState invokeDataLossState = Convert(actionState);

            StepStateNames prevContext = actionState.StateProgress.Peek();

            if (stateName == StepStateNames.LookingUpState)
            {
                step = new DataLossStepsFactory.LookingUpState(fabricClient, invokeDataLossState, requestTimeout, operationTimeout, action.PartitionSelector);
            }
            else if (stateName == StepStateNames.PerformingActions)
            {
                step = new DataLossStepsFactory.PerformingActions(fabricClient, invokeDataLossState, requestTimeout, operationTimeout, action.PartitionSelector, action.DataLossCheckWaitDurationInSeconds, action.DataLossCheckPollIntervalInSeconds, action.ReplicaDropWaitDurationInSeconds);
            }
            else if (stateName == StepStateNames.CompletedSuccessfully)
            {
                // done - but then this method should not have been called
                ReleaseAssert.Failfast(string.Format(CultureInfo.InvariantCulture, "{0} - GetStep() should not have been called when the state name is CompletedSuccessfully"), actionState.OperationId);
            }
            else
            {
                ReleaseAssert.Failfast(string.Format(CultureInfo.InvariantCulture, "{0} - Unexpected state name={1}", actionState.OperationId, stateName));
            }

            return(step);
        }
Exemplo n.º 2
0
 public PerformingActions(FabricClient fabricClient, InvokeDataLossState state, TimeSpan requestTimeout, TimeSpan operationTimeout, PartitionSelector partitionSelector, int dataLossCheckWaitDurationInSeconds, int dataLossCheckPollIntervalInSeconds, int replicaDropWaitDurationInSeconds)
     : base(fabricClient, state, requestTimeout, operationTimeout)
 {
     this.partitionSelector = partitionSelector;
     this.dataLossCheckWaitDurationInSeconds = dataLossCheckWaitDurationInSeconds;
     this.dataLossCheckPollIntervalInSeconds = dataLossCheckPollIntervalInSeconds;
     this.replicaDropWaitDurationInSeconds   = replicaDropWaitDurationInSeconds;
 }
Exemplo n.º 3
0
            public override Task CleanupAsync(CancellationToken cancellationToken)
            {
                // debug - remove later
                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - Enter Cleanup for PerformingAction", this.State.OperationId);
                InvokeDataLossState state = Convert(this.State);

                string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;

                return(RemoveUnreliableTransportAsync(this.FabricClient, state.Info.NodeName, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken));
            }
Exemplo n.º 4
0
        public static InvokeDataLossState Convert(ActionStateBase actionState)
        {
            InvokeDataLossState invokeDataLossState = actionState as InvokeDataLossState;

            if (invokeDataLossState == null)
            {
                throw new InvalidCastException("State object could not be converted");
            }

            return(invokeDataLossState);
        }
        public async Task <PartitionDataLossProgress> GetInvokeDataLossProgressAsync(
            Guid operationId,
            TimeSpan timeout,
            CancellationToken cancellationToken)
        {
            this.ThrowIfNotReady();
            PartitionDataLossProgress progress = null;

            TestabilityTrace.TraceSource.WriteInfo(TraceType, "Inside GetInvokeDataLossProgressAsync, operationId = {0}", operationId);

            try
            {
                ActionStateBase actionState = await this.MessageProcessor.ProcessGetProgressAsync(operationId, timeout, cancellationToken);

                StepStateNames stateName = actionState.StateProgress.Peek();

                TestCommandProgressState state = FaultAnalysisServiceUtility.ConvertState(actionState, TraceType);

                InvokeDataLossState invokeDataLossState = actionState as InvokeDataLossState;
                if (invokeDataLossState == null)
                {
                    throw new InvalidCastException("State object could not be converted");
                }

                StepStateNames stepState         = actionState.StateProgress.Peek();
                var            selectedPartition = new SelectedPartition
                {
                    ServiceName = invokeDataLossState.Info.PartitionSelector.ServiceName,
                    PartitionId = invokeDataLossState.Info.PartitionId
                };

                PartitionDataLossResult result = new PartitionDataLossResult(selectedPartition, actionState.ErrorCausingRollback);

                progress = new PartitionDataLossProgress(state, result);
                TestabilityTrace.TraceSource.WriteInfo(
                    TraceType,
                    "{0} - {1} progress - {2}, Exception - {3}",
                    operationId,
                    ActionType.InvokeDataLoss,
                    progress.Result != null ? progress.Result.SelectedPartition.ToString() : FASConstants.UnavailableMessage,
                    (progress.Result != null && progress.Result.Exception != null) ? progress.Result.Exception.ToString() : FASConstants.UnavailableMessage);
            }
            catch (Exception e)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Caught {1}", operationId, e.ToString());
                FaultAnalysisServiceUtility.ThrowTransientExceptionIfRetryable(e);

                throw;
            }

            return(progress);
        }
        // Use this method signature for now until the actual client interface is decided
        public async Task ProcessDataLossCommandAsync(Guid operationId, PartitionSelector partitionSelector, DataLossMode dataLossMode, TimeSpan timeout, ServiceInternalFaultInfo serviceInternalFaultInfo)
        {
            ThrowIfDataLossModeInvalid(dataLossMode);

            ActionStateBase actionState = new InvokeDataLossState(operationId, serviceInternalFaultInfo, partitionSelector, dataLossMode);

            try
            {
                // After this call finishes the intent has been persisted
                await this.actionStore.InitializeNewActionAsync(actionState, timeout);

                this.Enqueue(actionState);
            }
            catch (Exception e)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, e);
                throw;
            }
        }
Exemplo n.º 7
0
        private ActionStateBase ReadData(byte[] bytes)
        {
            ActionStateBase result = null;

            using (BinaryReader br = new BinaryReader(new MemoryStream(bytes)))
            {
                // The first 4 bytes are the command type
                ActionType a = ActionStateBase.ReadCommandType(br);

                if (a == ActionType.InvokeDataLoss)
                {
                    result = InvokeDataLossState.FromBytes(br);
                }
                else if (a == ActionType.InvokeQuorumLoss)
                {
                    result = InvokeQuorumLossState.FromBytes(br);
                }
                else if (a == ActionType.RestartPartition)
                {
                    result = RestartPartitionState.FromBytes(br);
                }
                else if (a == ActionType.TestStuck)
                {
                    result = StuckState.FromBytes(br);
                }
                else if (a == ActionType.TestRetryStep)
                {
                    result = TestRetryStepState.FromBytes(br);
                }
                else if (a == ActionType.StartNode)
                {
                    result = NodeCommandState.FromBytes(br, a);
                }
                else if (a == ActionType.StopNode)
                {
                    result = NodeCommandState.FromBytes(br, a);
                }
            }

            return(result);
        }
Exemplo n.º 8
0
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeDataLossState state = Convert(this.State);

                ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
                        this.partitionSelector.ServiceName,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                if (result.Kind != ServiceDescriptionKind.Stateful)
                {
                    // The message in the first arg is only for debugging, it is not returned to the user.
                    throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
                }

                int targetReplicaSetSize = (result as StatefulServiceDescription).TargetReplicaSetSize;

                SelectedPartition targetPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
                    this.FabricClient,
                    this.partitionSelector,
                    this.RequestTimeout,
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                Guid partitionId = targetPartition.PartitionId;

                long preDataLossNumber = 0;

                ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetPartitionListAsync(
                        this.partitionSelector.ServiceName,
                        null,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                bool partitionFound = false;

                foreach (StatefulServicePartition partition in partitionsResult)
                {
                    if (partition.PartitionInformation.Id == partitionId)
                    {
                        preDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                        partitionFound    = true;
                        break;
                    }
                }

                if (!partitionFound)
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                }

                ServiceReplicaList failoverManagerReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        FASConstants.FmPartitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                string failoverManagerPrimaryNodeName = string.Empty;
                var    readyFMReplicas = failoverManagerReplicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

                foreach (var replica in readyFMReplicas)
                {
                    StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulReplica == null, "FM Replica is not a stateful replica");
                    if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
                    {
                        failoverManagerPrimaryNodeName = replica.NodeName;
                    }
                }

                if (string.IsNullOrEmpty(failoverManagerPrimaryNodeName))
                {
                    throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
                }

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary location={1}", this.State.OperationId, failoverManagerPrimaryNodeName);
                string behaviorName = "BlockDoReconfiguration_" + this.State.OperationId;
                List <Tuple <string, string> > unreliableTransportInfo = new List <Tuple <string, string> >();

                unreliableTransportInfo.Add(new Tuple <string, string>(failoverManagerPrimaryNodeName, behaviorName));

                state.StateProgress.Push(StepStateNames.PerformingActions);
                state.Info.DataLossNumber          = preDataLossNumber;
                state.Info.NodeName                = failoverManagerPrimaryNodeName;
                state.Info.PartitionId             = partitionId;
                state.Info.UnreliableTransportInfo = unreliableTransportInfo;
                state.Info.TargetReplicaSetSize    = targetReplicaSetSize;
                return(state);
            }
Exemplo n.º 9
0
 public LookingUpState(FabricClient fabricClient, InvokeDataLossState state, TimeSpan requestTimeout, TimeSpan operationTimeout, PartitionSelector partitionSelector)
     : base(fabricClient, state, requestTimeout, operationTimeout)
 {
     this.partitionSelector = partitionSelector;
 }
Exemplo n.º 10
0
            public override async Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
            {
                InvokeDataLossState state = Convert(this.State);

                PartitionSelector partitionSelector   = state.Info.PartitionSelector;
                DataLossMode      dataLossMode        = state.Info.DataLossMode;
                long   preDataLossNumber              = state.Info.DataLossNumber;
                string failoverManagerPrimaryNodeName = state.Info.NodeName;
                Guid   partitionId          = state.Info.PartitionId;
                string behaviorName         = state.Info.UnreliableTransportInfo.First().Item2;
                int    targetReplicaSetSize = state.Info.TargetReplicaSetSize;

                TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying UT, partitionId={1}", this.State.OperationId, partitionId);
                System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
                behavior.AddFilterForPartitionId(partitionId);

                await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                        failoverManagerPrimaryNodeName,
                        behaviorName,
                        behavior,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
                // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
                await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

                ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                    () => this.FabricClient.QueryManager.GetReplicaListAsync(
                        partitionId,
                        0,
                        this.RequestTimeout,
                        cancellationToken),
                    this.OperationTimeout,
                    cancellationToken).ConfigureAwait(false);

                List <StatefulServiceReplica> replicaList = new List <StatefulServiceReplica>();

                foreach (var replica in replicasResult)
                {
                    StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
                    ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
                    replicaList.Add(statefulReplica);
                }

                // Select target replicas based on the DataLosMode
                List <StatefulServiceReplica> targets = null;

                if (dataLossMode == DataLossMode.FullDataLoss)
                {
                    targets = GetReplicasForFullDataLoss(replicaList);
                }
                else if (dataLossMode == DataLossMode.PartialDataLoss)
                {
                    targets = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, replicaList);
                }
                else
                {
                    throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedDataLossMode);
                }

                if (targets == null)
                {
                    // This will cause the command to rollback and retry
                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                foreach (var replica in targets)
                {
                    TestabilityTrace.TraceSource.WriteInfo(
                        StepBase.TraceType,
                        "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
                        this.State.OperationId,
                        replica.Id,
                        partitionId,
                        replica.ReplicaRole,
                        replica.ReplicaStatus);

                    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                            replica.NodeName,
                            partitionId,
                            replica.Id,
                            this.RequestTimeout,
                            cancellationToken),
                        FabricClientRetryErrors.RemoveReplicaErrors.Value,
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);
                }

                ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

                await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, targets, cancellationToken).ConfigureAwait(false);

                await RemoveUnreliableTransportAsync(this.FabricClient, failoverManagerPrimaryNodeName, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

                bool          dataLossWasSuccessful = false;
                TimeoutHelper timeoutHelper         = new TimeoutHelper(TimeSpan.FromSeconds(30));

                do
                {
                    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                        () => this.FabricClient.QueryManager.GetPartitionListAsync(
                            this.partitionSelector.ServiceName,
                            null,
                            this.RequestTimeout,
                            cancellationToken),
                        this.OperationTimeout,
                        cancellationToken).ConfigureAwait(false);

                    bool partitionFound     = false;
                    long postDataLossNumber = 0;
                    foreach (StatefulServicePartition partition in partitionsResult)
                    {
                        if (partition.PartitionInformation.Id == partitionId)
                        {
                            postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                            partitionFound     = true;
                            break;
                        }
                    }

                    if (!partitionFound)
                    {
                        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
                    }

                    TestabilityTrace.TraceSource.WriteInfo(
                        StepBase.TraceType,
                        "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
                        this.State.OperationId,
                        partitionId,
                        timeoutHelper.GetRemainingTime(),
                        preDataLossNumber,
                        postDataLossNumber);

                    if (postDataLossNumber != preDataLossNumber)
                    {
                        dataLossWasSuccessful = true;
                        break;
                    }

                    await System.Fabric.Common.AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds), cancellationToken).ConfigureAwait(false);
                }while (timeoutHelper.GetRemainingTime() > TimeSpan.Zero);

                if (!dataLossWasSuccessful)
                {
                    // This is only viewable internally for debug.  This will cause a retry of the whole flow.
                    string error = string.Format(
                        CultureInfo.InvariantCulture,
                        "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
                        this.State.OperationId,
                        partitionSelector.ServiceName,
                        partitionId,
                        this.dataLossCheckWaitDurationInSeconds);
                    TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);
                    throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
                }

                state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

                return(state);
            }
        private async Task <FabricTestAction> ConstructActionAsync(ActionType actionType, ActionStateBase actionStateBase)
        {
            FabricTestAction action = null;

            if (actionType == ActionType.InvokeDataLoss)
            {
                InvokeDataLossState actionState = actionStateBase as InvokeDataLossState;

                StepStateNames currentState = actionState.StateProgress.Peek();
                if (currentState == StepStateNames.IntentSaved)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    await this.actionStore.UpdateActionStateAsync(actionState);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
                }

                action = new InvokeDataLossAction(
                    this.stateManager,
                    this.Partition,
                    actionState,
                    actionState.Info.PartitionSelector,
                    actionState.Info.DataLossMode,
                    this.dataLossCheckWaitDurationInSeconds,
                    this.dataLossCheckPollIntervalInSeconds,
                    this.replicaDropWaitDurationInSeconds,
                    this.requestTimeout,
                    this.operationTimeout);
            }
            else if (actionType == ActionType.InvokeQuorumLoss)
            {
                InvokeQuorumLossState actionState = actionStateBase as InvokeQuorumLossState;

                StepStateNames currentState = actionState.StateProgress.Peek();
                if (currentState == StepStateNames.IntentSaved)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    await this.actionStore.UpdateActionStateAsync(actionState);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
                }

                // This is the case for resuming an action after a failover
                action = new InvokeQuorumLossAction(this.stateManager, this.Partition, actionState, actionState.Info.PartitionSelector, actionState.Info.QuorumLossMode, actionState.Info.QuorumLossDuration, this.requestTimeout, this.operationTimeout);
            }
            else if (actionType == ActionType.RestartPartition)
            {
                RestartPartitionState actionState = actionStateBase as RestartPartitionState;

                StepStateNames currentState = actionState.StateProgress.Peek();
                if (currentState == StepStateNames.IntentSaved)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    await this.actionStore.UpdateActionStateAsync(actionState);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
                }

                // This is the case for resuming an action after a failover
                action = new RestartPartitionAction(this.stateManager, this.Partition, actionState, actionState.Info.PartitionSelector, actionState.Info.RestartPartitionMode, this.requestTimeout, this.operationTimeout);
            }
            else if (actionType == ActionType.TestStuck)
            {
                StuckState actionState = actionStateBase as StuckState;

                StepStateNames currentState = actionState.StateProgress.Peek();
                if (currentState == StepStateNames.IntentSaved)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    await this.actionStore.UpdateActionStateAsync(actionState);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
                }

                action = new StuckAction(this.stateManager, this.Partition, actionState, this.requestTimeout, this.operationTimeout);
            }
            else if (actionType == ActionType.TestRetryStep)
            {
                TestRetryStepState actionState = actionStateBase as TestRetryStepState;

                StepStateNames currentState = actionState.StateProgress.Peek();
                if (currentState == StepStateNames.IntentSaved)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    await this.actionStore.UpdateActionStateAsync(actionState);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
                }

                action = new TestRetryStepAction(this.stateManager, this.Partition, actionState, this.requestTimeout, this.operationTimeout);
            }
            else if (actionType == ActionType.StartNode)
            {
                NodeCommandState actionState = actionStateBase as NodeCommandState;
                actionState.StoppedNodeTable = this.stoppedNodeTable;

                StepStateNames currentState = actionState.StateProgress.Peek();
                if (currentState == StepStateNames.IntentSaved)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    await this.actionStore.UpdateActionStateAsync(actionState);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
                }

                action = new StartNodeFromFASAction(this.stateManager, this.Partition, actionState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
            }
            else if (actionType == ActionType.StopNode)
            {
                NodeCommandState actionState = actionStateBase as NodeCommandState;
                actionState.StoppedNodeTable = this.stoppedNodeTable;

                StepStateNames currentState = actionState.StateProgress.Peek();
                if (currentState == StepStateNames.IntentSaved)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    await this.actionStore.UpdateActionStateAsync(actionState);

                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "action state has been updated");
                }

                action = new StopNodeFromFASAction(this.stateManager, this.Partition, actionState, this.stoppedNodeTable, this.requestTimeout, this.operationTimeout);
            }
            else
            {
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "Unknown actionType");
            }

            return(action);
        }