// Initializes the fields common to every command state: the operation id, the
// action type, the internal fault info, the step history and the receive time.
public ActionStateBase(Guid operationId, ActionType actionType, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    this.OperationId = operationId;
    this.ActionType = actionType;
    this.ServiceInternalFaultInfo = serviceInternalFaultInfo;

    // The step history always starts with a single IntentSaved entry.
    Stack<StepStateNames> initialProgress = new Stack<StepStateNames>();
    initialProgress.Push(StepStateNames.IntentSaved);
    this.StateProgress = initialProgress;

    // Recorded in UTC.
    this.TimeReceived = DateTime.UtcNow;
}
// Test only, only accessible from "internal test", not from fabric client.
// Builds a TestRetryStepState for the operation, hands it to the action store
// (using the default test timeout) and then enqueues it. Any exception is traced
// as a warning and rethrown to the caller.
public async Task ProcessRetryStepCommandAsync(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    ActionStateBase retryStepState = new TestRetryStepState(operationId, serviceInternalFaultInfo);

    try
    {
        await this.actionStore.InitializeNewActionAsync(retryStepState, FASConstants.DefaultTestTimeout);
        this.Enqueue(retryStepState);
    }
    catch (Exception failure)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, failure);
        throw;
    }
}
// State for the TestRetryStep action type.
public TestRetryStepState(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo)
    : base(operationId, ActionType.TestRetryStep, serviceInternalFaultInfo)
{
    // This command always retries a failing step in place rather than rolling back.
    this.RetryStepWithoutRollingBackOnFailure = true;
}
// Step that never finishes on its own: it waits on an infinite delay, so the
// only way out is cancellation of the supplied token.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    // Intentionally get stuck
    Task endlessWait = Task.Delay(Timeout.Infinite, cancellationToken);
    await endlessWait.ConfigureAwait(false);

    return null;
}
// Collects the information needed to induce quorum loss:
//   1. verifies the target service is stateful and has persisted state,
//   2. resolves the selected partition and lists its replicas,
//   3. picks the replicas to fault according to the quorum loss mode,
//   4. records partition id, replica ids and (node name, behavior name) pairs on
//      the state and advances the state to PerformingActions.
// Throws if the service is stateless or not persisted, if the quorum loss mode is
// unsupported, or (FabricTransientException) if no target replicas were chosen.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeQuorumLossState state = Convert(this.State);

    // Look up the service description so its kind and persistence can be checked.
    ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    if (serviceDescription.Kind != ServiceDescriptionKind.Stateful)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
    }

    StatefulServiceDescription statefulDescription = serviceDescription as StatefulServiceDescription;
    ReleaseAssert.AssertIf(
        statefulDescription == null,
        string.Format(CultureInfo.InvariantCulture, "{0} - Service is not a stateful service", this.State.OperationId));

    if (!statefulDescription.HasPersistedState)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        throw new FabricOnlyValidForStatefulPersistentServicesException("This is only valid for stateful persistent services", FabricErrorCode.OnlyValidForStatefulPersistentServices);
    }

    SelectedPartition selectedPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = selectedPartition.PartitionId;

    // Query the replicas that currently make up the partition.
    ServiceReplicaList queriedReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Every replica of a stateful service is expected to be a StatefulServiceReplica.
    List<StatefulServiceReplica> statefulReplicas = new List<StatefulServiceReplica>();
    foreach (var queried in queriedReplicas)
    {
        StatefulServiceReplica asStateful = queried as StatefulServiceReplica;
        ReleaseAssert.AssertIf(asStateful == null, "Expected stateful replica");
        statefulReplicas.Add(asStateful);
    }

    List<StatefulServiceReplica> replicasToFault;
    switch (this.quorumLossMode)
    {
        case QuorumLossMode.AllReplicas:
            replicasToFault = statefulReplicas
                .Where(r => r.ReplicaRole == ReplicaRole.Primary || r.ReplicaRole == ReplicaRole.ActiveSecondary)
                .ToList();
            break;

        case QuorumLossMode.QuorumReplicas:
            replicasToFault = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, statefulReplicas);
            break;

        default:
            throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedQuorumLossMode);
    }

    if (replicasToFault == null)
    {
        // This will cause the command to rollback and retry
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    List<string> nodesToFault = replicasToFault.Select(r => r.NodeName).ToList();

    List<Tuple<string, string>> transportInfo = new List<Tuple<string, string>>();
    foreach (string nodeName in nodesToFault)
    {
        // NOTE(review): this behavior object is built and filtered but not used
        // again in this step; only the behavior name is recorded below.
        UnreliableTransportBehavior behavior = new UnreliableTransportBehavior("*", "StatefulServiceReopen");
        behavior.AddFilterForPartitionId(partitionId);

        // ApplyingUnreliableTransport.BehaviorNamePrefix + nodeName;
        string behaviorName = this.CreateBehaviorName(nodeName);

        transportInfo.Add(Tuple.Create(nodeName, behaviorName));
    }

    state.StateProgress.Push(StepStateNames.PerformingActions);
    state.Info.PartitionId = partitionId;
    state.Info.ReplicaIds = replicasToFault.Select(r => r.Id).ToList();
    state.Info.UnreliableTransportInfo = transportInfo;

    return state;
}
// Starts the node named in the command state.
// Outcome handling:
//   - the call succeeds: the node is marked as no longer stopped, the step is
//     validated and the state advances to CompletedSuccessfully;
//   - a FabricException is thrown: HandleFabricException decides between
//     success, retrying this step, or failing;
//   - any other exception: the step is retried.
// Throws RetrySameStepException to retry the step and FatalException to fail it.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    NodeCommandState state = Convert(this.State);

    // The return value is ignored, this is just being used to check if the RemoveNodeState was called.
    await FaultAnalysisServiceUtility.GetNodeInfoAsync(
        this.State.OperationId,
        this.FabricClient,
        state.Info.NodeName,
        this.action.Partition,
        this.action.StateManager,
        this.action.StoppedNodeTable,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - calling StartNodeUsingNodeNameAsync, ApiInputNodeInstanceId={1}",
        this.State.OperationId,
        state.Info.InputNodeInstanceId);

    // Capture any failure instead of letting it escape so it can be classified below.
    Exception startNodeError = null;
    try
    {
        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.FaultManager.StartNodeUsingNodeNameAsync(
                state.Info.NodeName,
                state.Info.InputNodeInstanceId,
                null,
                0,
                this.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.StartNodeErrors.Value,
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);
    }
    catch (Exception caught)
    {
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, "{0} - StartNodeUsingNodeNameAsync threw {1}", this.State.OperationId, caught);
        startNodeError = caught;
    }

    // The catch-all above also captures cancellation, so surface it here.
    cancellationToken.ThrowIfCancellationRequested();

    SuccessRetryOrFail outcome;
    if (startNodeError == null)
    {
        // success
        outcome = SuccessRetryOrFail.Success;

        await FaultAnalysisServiceUtility.SetStoppedNodeStateAsync(
            this.action.State.OperationId,
            this.action.Partition,
            this.action.StateManager,
            this.action.StoppedNodeTable,
            state.Info.NodeName,
            false,
            cancellationToken).ConfigureAwait(false);
    }
    else
    {
        FabricException fabricError = startNodeError as FabricException;
        if (fabricError == null)
        {
            TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, "{0} - StartNodeUsingNodeNameAsync threw non-FabricException with ErrorCode={1}", this.State.OperationId, startNodeError);
            outcome = SuccessRetryOrFail.RetryStep;
        }
        else
        {
            outcome = this.HandleFabricException(fabricError, state);
        }
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    switch (outcome)
    {
        case SuccessRetryOrFail.RetryStep:
            throw new RetrySameStepException("retrystep", startNodeError);

        case SuccessRetryOrFail.Fail:
            throw new FatalException("fatal", startNodeError);

        case SuccessRetryOrFail.Success:
            // no-op
            break;

        default:
            ReleaseAssert.Failfast(string.Format(CultureInfo.InvariantCulture, "This condition should not have been hit. OperationId: {0}", this.State.OperationId));
            break;
    }

    await this.ValidateAsync(this.FabricClient, state, cancellationToken).ConfigureAwait(false);

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);
    return state;
}
// State for the InvokeDataLoss action type; the partition selector and data loss
// mode are stored in an InvokeDataLossInfo exposed through Info.
public InvokeDataLossState(
    Guid operationId,
    ServiceInternalFaultInfo serviceInternalFaultInfo,
    PartitionSelector partitionSelector,
    DataLossMode dataLossMode)
    : base(operationId, ActionType.InvokeDataLoss, serviceInternalFaultInfo)
{
    this.Info = new InvokeDataLossInfo(partitionSelector, dataLossMode);
}
// Rolls back the step currently on top of actionState.StateProgress.
// The step's CleanupAsync is retried, with a fixed backoff between attempts,
// until it succeeds; on success the step is popped from the progress stack.
// The loop stops early, without popping, when a force rollback is observed.
// Exceptions that escape the loop are traced and rethrown.
private async Task CleanupStepAsync(
    FabricClient fabricClient,
    FabricTestAction action,
    ActionStateBase actionState,
    CancellationToken cancellationToken,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    StepStateNames stepName = actionState.StateProgress.Peek();
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Cleaning up state={0}, name={1}, key={2}", stepName, actionState.ActionType, actionState.OperationId);

    StepBase stepToCleanUp = action.GetStep(fabricClient, actionState, stepName, cancellationToken);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Cleaning up {1}", actionState.OperationId, stepToCleanUp.StepName);

    try
    {
        bool cleanedUp = false;
        while (!cleanedUp)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState observedRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(
                actionState,
                cancellationToken,
                FASConstants.InnerCleanupLoop).ConfigureAwait(false);

            if (observedRollbackState == RollbackState.RollingBackForce)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Inner cleanup loop read RollingBackForce", actionState.OperationId);
                break;
            }

            if (observedRollbackState == RollbackState.RollingBackDueToUserCancel)
            {
                // Do nothing, already rolling back
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Inner cleanup loop read RollingBackDueToUserCancel", actionState.OperationId);
            }

            try
            {
                await stepToCleanUp.CleanupAsync(cancellationToken).ConfigureAwait(false);
                actionState.StateProgress.Pop();
                cleanedUp = true;
            }
            catch (Exception cleanupException)
            {
                TestabilityTrace.TraceSource.WriteWarning(
                    TraceType,
                    "{0} - Cleanup of action type={1}, failed with {2}, retrying",
                    actionState.OperationId,
                    actionState.ActionType,
                    cleanupException);
            }

            if (!cleanedUp)
            {
                // Back off before the next cleanup attempt.
                await Task.Delay(TimeSpan.FromSeconds(this.commandStepRetryBackoffInSeconds), cancellationToken).ConfigureAwait(false);
            }
        }
    }
    catch (Exception e)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - CleanupStepAsync, error: {1}", actionState.OperationId, e.ToString());
        throw;
    }
}
// Runs the step currently on top of actionState.StateProgress.
// The loop exits when the step's RunAsync returns, when a force rollback is
// observed, or when a user cancel is observed and the command does not retry
// steps in place. If RunAsync throws:
//   - commands with RetryStepWithoutRollingBackOnFailure set trace the failure,
//     call ProcessRetryStepExceptions, back off and run the step again;
//   - all other commands rethrow.
// Exceptions that escape the loop are traced and rethrown.
private async Task RunStepAsync(
    FabricClient fabricClient,
    FabricTestAction action,
    ActionStateBase actionState,
    CancellationToken cancellationToken,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    StepStateNames stepName = actionState.StateProgress.Peek();
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "Running state={0}, name={1}, key={2}", stepName, actionState.ActionType, actionState.OperationId);

    StepBase stepToRun = action.GetStep(fabricClient, actionState, stepName, cancellationToken);
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Running {1}", actionState.OperationId, stepToRun.StepName);

    try
    {
        while (true)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState observedRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(
                actionState,
                cancellationToken,
                FASConstants.InnerForwardLoop).ConfigureAwait(false);
            TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - readRollbackState={1}", actionState.OperationId, observedRollbackState);

            // If RetryStepWithoutRollingbackOnFailure == true, then don't allow graceful user cancel
            if (!actionState.RetryStepWithoutRollingBackOnFailure && observedRollbackState == RollbackState.RollingBackDueToUserCancel)
            {
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - read RollingBackDueToUserCancel breaking from Run loop inside RunStepAsync()", actionState.OperationId);
                break;
            }

            // RollingBackForce always stops execution
            if (observedRollbackState == RollbackState.RollingBackForce)
            {
                break;
            }

            try
            {
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0}, {1} - calling Step.Run()", actionState.OperationId, actionState.ActionType);
                await stepToRun.RunAsync(cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0}, {1} - calling break after run", actionState.OperationId, actionState.ActionType);
                break;
            }
            catch (Exception stepFailure)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0}, {1} - runException {2}", actionState.OperationId, actionState.ActionType, stepFailure);

                if (!actionState.RetryStepWithoutRollingBackOnFailure)
                {
                    throw;
                }

                // trace and loop. Should have /backoff/?
                TestabilityTrace.TraceSource.WriteWarning(
                    TraceType,
                    "{0}, {1} has RetryStepWithoutRollingbackOnFailure set to true, retrying step name='{2}'. Caught exception: {3}",
                    actionState.OperationId,
                    actionState.ActionType,
                    stepToRun.StepName,
                    stepFailure);

                this.ProcessRetryStepExceptions(actionState.OperationId, stepFailure);
            }

            // Only reached when the step failed and is being retried in place.
            await Task.Delay(TimeSpan.FromSeconds(this.commandStepRetryBackoffInSeconds), cancellationToken).ConfigureAwait(false);
        }
    }
    catch (Exception e)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - RunState, error: {1}", actionState.OperationId, e.ToString());
        throw;
    }
}
// Collects the information needed to restart a partition:
//   1. validates the restart mode against the service kind,
//   2. resolves the selected partition,
//   3. finds the node hosting the ready primary replica of the FM partition,
//   4. records partition id, FM primary node, persistence flag and the
//      (node name, behavior name) pair on the state and advances the state to
//      PerformingActions.
// Throws E_INVALIDARG when OnlyActiveSecondaries is requested for a service that
// is not stateful, and FabricException(NotReady) when no ready FM primary is found.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "Inside CollectingState, service={0}", this.partitionSelector.ServiceName);

    RestartPartitionState state = Convert(this.State);

    // Get service info and validate if the parameters are valid
    ServiceDescription serviceDescription = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    bool isStateful = serviceDescription.Kind == ServiceDescriptionKind.Stateful;

    if (!isStateful && this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        string debugText = string.Format(CultureInfo.InvariantCulture, "RestartPartition: for stateless services only RestartPartitionMode.AllReplicasOrInstances is valid");
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, debugText);
        throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, debugText);
    }

    bool hasPersistedState = false;
    if (isStateful)
    {
        StatefulServiceDescription statefulDescription = serviceDescription as StatefulServiceDescription;
        ReleaseAssert.AssertIf(statefulDescription == null, "Stateful service description is not WinFabricStatefulServiceDescription");
        hasPersistedState = statefulDescription.HasPersistedState;
    }

    SelectedPartition selectedPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = selectedPartition.PartitionId;

    // get replicas for target
    // NOTE(review): the returned list is not read in this step; the call is kept
    // so that any failure it raises still surfaces here.
    ServiceReplicaList targetReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // get replicas for fm in order to get the primary
    ServiceReplicaList fmReplicas = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            FASConstants.FmPartitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    string fmPrimaryNodeName = string.Empty;
    foreach (var fmReplica in fmReplicas)
    {
        // Only replicas that are Ready are considered.
        if (fmReplica.ReplicaStatus != ServiceReplicaStatus.Ready)
        {
            continue;
        }

        StatefulServiceReplica statefulFmReplica = fmReplica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulFmReplica == null, "FM Replica is not a stateful replica");

        if (statefulFmReplica.ReplicaRole == ReplicaRole.Primary)
        {
            fmPrimaryNodeName = fmReplica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(fmPrimaryNodeName))
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary is at node={1}", this.State.OperationId, fmPrimaryNodeName);

    // The behavior name is derived from the operation id.
    string behaviorName = RestartingSelectedReplicas.UTBehaviorPrefixName + "_" + this.State.OperationId;

    List<Tuple<string, string>> transportInfo = new List<Tuple<string, string>>
    {
        Tuple.Create(fmPrimaryNodeName, behaviorName)
    };

    state.StateProgress.Push(StepStateNames.PerformingActions);
    state.Info.PartitionId = partitionId;
    state.Info.NodeName = fmPrimaryNodeName;
    state.Info.HasPersistedState = hasPersistedState;
    state.Info.UnreliableTransportInfo = transportInfo;

    return state;
}
// Performs the partition restart using the data recorded on the state:
//   1. adds an unreliable transport behavior ("*", "DoReconfiguration"), filtered
//      to the target partition, on the recorded FM primary node,
//   2. restarts (persisted services) or removes (non-persisted services) every
//      Ready replica of the partition, skipping the primary when the mode is
//      OnlyActiveSecondaries,
//   3. removes the unreliable transport behavior and advances the state to
//      CompletedSuccessfully.
// Any exception from the fabric calls propagates to the caller.
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    RestartPartitionState state = Convert(this.State);

    Guid partitionId = state.Info.PartitionId;
    bool hasPersistedState = state.Info.HasPersistedState;
    string failoverManagerPrimaryNodeName = state.Info.NodeName;
    string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;

    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    behavior.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            failoverManagerPrimaryNodeName,
            behaviorName,
            behavior,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applied UT on partitionId {1}, node={2}", this.State.OperationId, partitionId, failoverManagerPrimaryNodeName);

    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Only replicas that are Ready are restarted.
    var stableReplicasToRestart = replicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();

    foreach (var replica in stableReplicasToRestart)
    {
        if (this.restartPartitionMode == RestartPartitionMode.OnlyActiveSecondaries)
        {
            StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
            ReleaseAssert.AssertIf(statefulReplica == null, "Stateful service replica is not StatefulServiceReplica");

            // In this mode the primary is left alone.
            if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
            {
                continue;
            }
        }

        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - restarting replica partition={1}, node={2}, replica id={3}", this.State.OperationId, partitionId, replica.NodeName, replica.Id);

        if (hasPersistedState)
        {
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.FabricClient.ServiceManager.RestartReplicaAsync(
                    replica.NodeName,
                    partitionId,
                    replica.Id,
                    this.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RestartReplicaErrors.Value,
                this.OperationTimeout,
                cancellationToken).ConfigureAwait(false);
        }
        else
        {
            // Services without persisted state have their replica removed instead of restarted.
            await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
                () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                    replica.NodeName,
                    partitionId,
                    replica.Id,
                    this.RequestTimeout,
                    cancellationToken),
                FabricClientRetryErrors.RemoveReplicaErrors.Value,
                this.OperationTimeout,
                cancellationToken).ConfigureAwait(false);
        }
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    // ConfigureAwait(false) added so this await matches every other await in the method.
    await RemoveUnreliableTransportAsync(this.State, this.FabricClient, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

    return state;
}
// State for the InvokeQuorumLoss action type; the partition selector, quorum loss
// mode and duration are stored in an InvokeQuorumLossInfo exposed through Info.
public InvokeQuorumLossState(
    Guid operationId,
    ServiceInternalFaultInfo serviceInternalFaultInfo,
    PartitionSelector partitionSelector,
    QuorumLossMode quorumLossMode,
    TimeSpan quorumLossDuration)
    : base(operationId, ActionType.InvokeQuorumLoss, serviceInternalFaultInfo)
{
    this.Info = new InvokeQuorumLossInfo(partitionSelector, quorumLossMode, quorumLossDuration);
}
// Accepts a stop-node command: registers the node name with the node
// synchronizer, initializes a NodeCommandState (ActionType.StopNode) in the
// action store and enqueues it. If initialization or enqueueing fails, the node
// name is removed from the synchronizer again, the exception is traced as a
// warning and rethrown.
public async Task ProcessStopNodeCommandAsync(
    Guid operationId,
    string nodeName,
    BigInteger nodeInstanceId,
    int durationInSeconds,
    TimeSpan timeout,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - ProcessStopNodeCommandAsync, duration is {1}", operationId, durationInSeconds);

    this.entitySynch.NodeSynchronizer.Add(nodeName);

    NodeCommandState stopNodeState = new NodeCommandState(
        ActionType.StopNode,
        operationId,
        this.entitySynch.NodeSynchronizer,
        serviceInternalFaultInfo,
        nodeName,
        nodeInstanceId,
        durationInSeconds);

    try
    {
        // After this call finishes the intent has been persisted
        await this.actionStore.InitializeNewActionAsync(stopNodeState, timeout);

        this.Enqueue(stopNodeState);
    }
    catch (Exception failure)
    {
        // Undo the registration made above before surfacing the error.
        this.entitySynch.NodeSynchronizer.Remove(nodeName);
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, failure);
        throw;
    }
}
// Use this method signature for now until the actual client interface is decided.
// Accepts a restart-partition command: validates the restart mode, initializes a
// RestartPartitionState in the action store and enqueues it. Exceptions from
// initialization or enqueueing are traced as warnings and rethrown.
public async Task ProcessRestartPartitionCommandAsync(
    Guid operationId,
    PartitionSelector partitionSelector,
    RestartPartitionMode restartPartitionMode,
    TimeSpan timeout,
    ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    ThrowIfRestartPartitionModeInvalid(restartPartitionMode);

    RestartPartitionState restartState = new RestartPartitionState(
        operationId,
        serviceInternalFaultInfo,
        partitionSelector,
        restartPartitionMode);

    try
    {
        // After this call finishes the intent has been persisted
        await this.actionStore.InitializeNewActionAsync(restartState, timeout);

        this.Enqueue(restartState);
    }
    catch (Exception failure)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Exception {1}", operationId, failure);
        throw;
    }
}
// State for the TestStuck action type; it adds nothing beyond what the base
// constructor initializes.
public StuckState(Guid operationId, ServiceInternalFaultInfo serviceInternalFaultInfo)
    : base(operationId, ActionType.TestStuck, serviceInternalFaultInfo)
{
}
// State for the RestartPartition action type; the partition selector and restart
// mode are stored in a RestartPartitionInfo exposed through Info.
public RestartPartitionState(
    Guid operationId,
    ServiceInternalFaultInfo serviceInternalFaultInfo,
    PartitionSelector partitionSelector,
    RestartPartitionMode restartPartitionMode)
    : base(operationId, ActionType.RestartPartition, serviceInternalFaultInfo)
{
    this.Info = new RestartPartitionInfo(partitionSelector, restartPartitionMode);
}
// Drives one pass of the command state machine for actionState.
//
// Forward phase (entered when not rolling back, or when the command retries steps
// in place and no force rollback is pending): runs steps one at a time until the
// top of StateProgress is CompletedSuccessfully, persisting the state after each
// step. A step failure either retries the same step (RetrySameStepException) or
// is classified by HandleRollback as retryable / non-retryable and switches the
// command into a rollback state.
//
// Rollback phase: cleans up steps in reverse until StateProgress reaches
// IntentSaved or Failed, then either re-arms the command for a retry
// (LookingUpState) or marks it Failed.
//
// FabricNotPrimaryException, FabricObjectClosedException and
// OperationCanceledException are propagated to the caller; any other exception
// escaping the rollback phase fails fast.
private async Task RunStateMachineAsync(
    FabricClient fabricClient,
    FabricTestAction action,
    ActionStateBase actionState,
    ServiceInternalFaultInfo serviceInternalFaultInfo,
    CancellationToken cancellationToken)
{
    ReleaseAssert.AssertIf(
        actionState.StateProgress == null || actionState.StateProgress.Count == 0,
        "ActionProgress should not be null or empty");

    // Set only when a step fails in the forward loop during this invocation.
    Exception actionError = null;

    if (actionState.RollbackState == RollbackState.NotRollingBack ||
        (actionState.RollbackState != RollbackState.RollingBackForce && actionState.RetryStepWithoutRollingBackOnFailure))
    {
        // TODO: should also include Error
        while (actionState.StateProgress.Peek() != StepStateNames.CompletedSuccessfully)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.ForwardLoop).ConfigureAwait(false);
            if ((readRollbackState == RollbackState.RollingBackForce) ||
                ((readRollbackState == RollbackState.RollingBackDueToUserCancel) && !actionState.RetryStepWithoutRollingBackOnFailure))
            {
                break;
            }

            try
            {
                await this.RunStepAsync(fabricClient, action, actionState, cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);

                ActionTest.PerformInternalServiceFaultIfRequested(actionState.OperationId, serviceInternalFaultInfo, actionState, cancellationToken);

                if (actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
                {
                    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - completed successfully, clearing ErrorCausingRollback", actionState.OperationId);
                    actionState.ErrorCausingRollback = 0;
                }

                actionState.TimeStopped = DateTime.UtcNow;
                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
            }
            catch (RetrySameStepException)
            {
                // Retry the command in the same step - do not rollback or go forward, and do not call ActionStore.UpdateActionStateAsync().
                // The operation id is passed explicitly: the format string has two placeholders.
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - threw RetrySameStepException, retrying state {1} ", actionState.OperationId, actionState.StateProgress.Peek());
            }
            catch (FabricNotPrimaryException)
            {
                throw;
            }
            catch (FabricObjectClosedException)
            {
                throw;
            }
            catch (Exception e)
            {
                TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - RunStateMachineAsync caught: {1}", actionState.OperationId, e.ToString());

                // Waited on synchronously; this call sits inside a catch block.
                readRollbackState = this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.ForwardLoopExceptionBlock).GetAwaiter().GetResult();

                // 1st line: if this is a force rollback (RollingBackForce), just exit
                // 2nd line: if !RetryStepWithoutRollingBackOnFailure and there was a graceful cancel then exit this block and proceed to the rollback code block below.
                // If RetryStepWithoutRollingBackOnFailure is true, which it is only for the node steps today, then first call HandleRollback to translate the exception.
                if ((readRollbackState == RollbackState.RollingBackForce) ||
                    ((readRollbackState == RollbackState.RollingBackDueToUserCancel) && !actionState.RetryStepWithoutRollingBackOnFailure))
                {
                    break;
                }
                else
                {
                    bool isRetryable = this.HandleRollback(actionState.OperationId, e);
                    if (isRetryable)
                    {
                        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - observed retryable exception, will retry action. Exception: {1}", actionState.OperationId, e.ToString());
                        actionState.RollbackState = RollbackState.RollingBackAndWillRetryAction;
                    }
                    else
                    {
                        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - observed non-retryable exception. Exception: {1}", actionState.OperationId, e.ToString());
                        actionState.RollbackState = RollbackState.RollingBackAndWillFailAction;
                    }
                }

                actionError = e;
                break;
            }
        }
    }

    if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction ||
        actionState.RollbackState == RollbackState.RollingBackAndWillFailAction ||
        (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel && (actionState.StateProgress.Peek() != StepStateNames.CompletedSuccessfully)))
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Rollingback type={1}", actionState.OperationId, actionState.ActionType);

        if (!this.isTestMode && actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
        {
            string error = string.Format(CultureInfo.InvariantCulture, "{0} - state should not be CompletedSuccessfully", actionState.OperationId);
            TestabilityTrace.TraceSource.WriteError(TraceType, error);
            ReleaseAssert.Failfast(error);
        }

        // If actionError is null it means we are currently running a resumed rollback. In that case the ErrorCausingRollback must have
        // already been set, so it is only translated and overwritten when a step failed during this invocation.
        if (actionError != null)
        {
            actionState.ErrorCausingRollback = TranslateRollbackError(actionError.HResult);
            TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Translated ErrorCausingRollback ={1}", actionState.OperationId, actionState.ErrorCausingRollback);
        }

        if (this.isTestMode && actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
        {
            // In test mode it's intentionally possible to fault an action after it's completed its work, but before the state name has been updated.
            actionState.StateProgress.Pop();
        }

        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);

        try
        {
            while (actionState.StateProgress.Peek() != StepStateNames.IntentSaved &&
                   actionState.StateProgress.Peek() != StepStateNames.Failed)
            {
                cancellationToken.ThrowIfCancellationRequested();

                RollbackState readRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(actionState, cancellationToken, FASConstants.OuterCleanupLoop).ConfigureAwait(false);
                if (readRollbackState == RollbackState.RollingBackDueToUserCancel)
                {
                    // Do nothing, already rolling back - debug only
                    TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Read RollingBackDueToUserCancel in outer rollback loop", actionState.OperationId);
                }
                else if (readRollbackState == RollbackState.RollingBackForce)
                {
                    TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} - Read RollingBackForce in outer rollback loop", actionState.OperationId);
                    break;
                }

                StepStateNames currentStateName = actionState.StateProgress.Peek();
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - DEBUG - Rollback path loop, current state {1}", actionState.OperationId, currentStateName);

                try
                {
                    await this.CleanupStepAsync(fabricClient, action, actionState, cancellationToken, serviceInternalFaultInfo).ConfigureAwait(false);
                    await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
                }
                catch (OperationCanceledException)
                {
                    // Cancellation while cleaning up a step is expected (CleanupStepAsync checks the token and
                    // delays on it). Let it reach the handler below instead of failing fast.
                    throw;
                }
                catch (FabricNotPrimaryException)
                {
                    throw;
                }
                catch (FabricObjectClosedException)
                {
                    throw;
                }
                catch (Exception e)
                {
                    ReleaseAssert.Failfast("Unexpected exception, RunStateAsync for cleanup should have handled {0}", e);
                }
            }

            // If this is true rollback is finished. If it is retryable set the state to LookingUpState
            if (actionState.StateProgress.Peek() == StepStateNames.IntentSaved)
            {
                if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction)
                {
                    actionState.StateProgress.Push(StepStateNames.LookingUpState);
                    actionState.ClearInfo();
                }
                else if (actionState.RollbackState == RollbackState.RollingBackAndWillFailAction)
                {
                    actionState.StateProgress.Push(StepStateNames.Failed);
                    actionState.RollbackState = RollbackState.NotRollingBack;
                    actionState.TimeStopped = DateTime.UtcNow;
                }
                else if (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel)
                {
                    actionState.StateProgress.Push(StepStateNames.Failed);
                    actionState.TimeStopped = DateTime.UtcNow;
                }
                else if (actionState.RollbackState == RollbackState.RollingBackForce)
                {
                    actionState.StateProgress.Push(StepStateNames.Failed);
                    actionState.TimeStopped = DateTime.UtcNow;
                }
                else
                {
                    string error = string.Format(CultureInfo.InvariantCulture, "{0} - RollbackState == NotRollingBack not expected", actionState.OperationId);
                    ReleaseAssert.Failfast(error);
                }
            }
            else if (actionState.RollbackState == RollbackState.RollingBackForce)
            {
                actionState.StateProgress.Push(StepStateNames.Failed);
                actionState.TimeStopped = DateTime.UtcNow;
            }
        }
        catch (OperationCanceledException)
        {
            // This means the cancellation token is set, not that an api call observed an E_ABORT
            throw;
        }
        catch (FabricNotPrimaryException)
        {
            throw;
        }
        catch (FabricObjectClosedException)
        {
            throw;
        }
        catch (Exception e)
        {
            ReleaseAssert.Failfast("Unexpected exception, RunStateAsync for cleanup should have handled {0}", e);
        }

        TestabilityTrace.TraceSource.WriteInfo(
            TraceType,
            "{0} - Action failed, type='{1}', will retry={2}, RollbackState={3}",
            actionState.OperationId,
            actionState.ActionType,
            actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction ? "true" : "false",
            actionState.RollbackState);

        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
    }
    else if (actionState.StateProgress.Peek() == StepStateNames.CompletedSuccessfully)
    {
        // user cancelled, but action/command completed anyways before cancellation was checked.
        TestabilityTrace.TraceSource.WriteInfo(TraceType, "DEBUG {0} - Action type '{1}' completed successfully, not updating again ", actionState.OperationId, actionState.ActionType);
    }
    else if ((actionState.StateProgress.Peek() == StepStateNames.IntentSaved) &&
             (actionState.RollbackState == RollbackState.RollingBackDueToUserCancel))
    {
        actionState.StateProgress.Push(StepStateNames.Failed);
        actionState.TimeStopped = DateTime.UtcNow;
        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
    }
    else if (actionState.RollbackState == RollbackState.RollingBackForce)
    {
        // Note: unlike the case above this does not have a state of IntentSaved as a requirement since a force rollback is an abort and does not run the steps in reverse.
        // It is possible for the StateProgress to be CompletedSuccessfully here, since we want to exit as quickly as possible. In that case, the block 2 blocks above handles it -
        // we do nothing extra, and the command finishes executing. If the user calls an api for information on this command, we translate the state to ForceCancelled if state is a terminal state
        // and RollbackState is RollingBackForce. See ActionStore.MatchesStateFilter().
        actionState.TimeStopped = DateTime.UtcNow;
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "Bottom of Engine.RunAsync() - state is={0}, rollbackState={1}", actionState.StateProgress.Peek().ToString(), actionState.RollbackState.ToString());
        actionState.StateProgress.Push(StepStateNames.Failed);
        await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
    }
    else
    {
        string unexpectedError = string.Format(CultureInfo.InvariantCulture, "Unexpected case reached, state is={0}, rollbackState={1}", actionState.StateProgress.Peek().ToString(), actionState.RollbackState.ToString());
        TestabilityTrace.TraceSource.WriteError(TraceType, "{0}", unexpectedError);
        ReleaseAssert.Failfast(unexpectedError);
    }
}
/// <summary>
/// Second step of the test-only retry command. Traces its progress, waits 500 ms to simulate
/// work, calls <c>ActionTest.PerformInternalServiceFaultIfRequested</c>, and then pushes
/// <c>StepStateNames.CompletedSuccessfully</c> onto the state's progress stack.
/// </summary>
/// <param name="cancellationToken">Forwarded to the internal fault helper; the simulated-work delay does not observe it.</param>
/// <param name="serviceInternalFaultInfo">Fault description forwarded to the internal fault helper.</param>
/// <returns>The step's state object after the completion marker has been pushed.</returns>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    Guid operationId = this.State.OperationId;

    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - Inside TestRetryStepStepTwo - this should retry w/o rollback when exception is thrown",
        operationId);

    // The converted state is not read below; the conversion call itself is preserved.
    TestRetryStepState retryStepState = Convert(this.State);

    // Simulate work
    TimeSpan simulatedWorkDuration = TimeSpan.FromMilliseconds(500);
    await Task.Delay(simulatedWorkDuration).ConfigureAwait(false);

    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - Inside TestRetryStepStepTwo - before PerformInternalServiceFaultIfRequested",
        operationId);

    ActionTest.PerformInternalServiceFaultIfRequested(
        operationId,
        serviceInternalFaultInfo,
        this.State,
        cancellationToken,
        true);

    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - Inside TestRetryStepStepTwo - after PerformInternalServiceFaultIfRequested",
        operationId);

    ActionStateBase completedState = this.State;
    completedState.StateProgress.Push(StepStateNames.CompletedSuccessfully);
    return completedState;
}
/// <summary>
/// Drives one command through the engine's state machine until the top of its progress stack is
/// either <c>CompletedSuccessfully</c> or <c>Failed</c>, or until a forced rollback is observed.
/// </summary>
/// <remarks>
/// <c>FabricNotPrimaryException</c> and <c>FabricObjectClosedException</c> are traced and swallowed;
/// any other exception is traced as a warning and rethrown.
/// </remarks>
/// <param name="fc">Fabric client handed to the state machine.</param>
/// <param name="action">The action whose steps are being run.</param>
/// <param name="actionState">Persisted state of the command; mutated and saved as the loop progresses.</param>
/// <param name="serviceInternalFaultInfo">Internal fault description handed to the state machine.</param>
/// <param name="cancellationToken">Checked at the top of every iteration and passed to awaited calls.</param>
public async Task RunAsync(FabricClient fc, FabricTestAction action, ActionStateBase actionState, ServiceInternalFaultInfo serviceInternalFaultInfo, CancellationToken cancellationToken)
{
    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Inside RunAsync of Engine, entering state machine", actionState.OperationId);

    try
    {
        while (true)
        {
            cancellationToken.ThrowIfCancellationRequested();

            RollbackState observedRollbackState = await this.CheckUserCancellationAndUpdateIfNeededAsync(
                actionState,
                cancellationToken,
                FASConstants.OuterLoop).ConfigureAwait(false);

            // Only the force case leaves the loop here; the non-force case still needs to run cleanup,
            // so it continues into the state machine below.
            if (observedRollbackState == RollbackState.RollingBackForce)
            {
                actionState.StateProgress.Push(StepStateNames.Failed);
                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);
                break;
            }

            await this.RunStateMachineAsync(fc, action, actionState, serviceInternalFaultInfo, cancellationToken).ConfigureAwait(false);

            if (actionState.RollbackState == RollbackState.RollingBackAndWillRetryAction)
            {
                actionState.ErrorCausingRollback = 0;

                int pauseSeconds = this.random.Next(10, 60);
                TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Pausing for {1} seconds before retrying", actionState.OperationId, pauseSeconds);

                // Reset the rollback state so the command moves forward again when it resumes.
                actionState.RollbackState = RollbackState.NotRollingBack;
                await this.actionStore.UpdateActionStateAsync(actionState).ConfigureAwait(false);

                await Task.Delay(TimeSpan.FromSeconds(pauseSeconds), cancellationToken).ConfigureAwait(false);
            }

            // Stop once the command has reached one of its two terminal step states.
            StepStateNames currentStep = actionState.StateProgress.Peek();
            if (currentStep == StepStateNames.CompletedSuccessfully || currentStep == StepStateNames.Failed)
            {
                break;
            }
        }
    }
    catch (FabricNotPrimaryException notPrimaryException)
    {
        FaultAnalysisServiceUtility.TraceFabricNotPrimary(actionState.OperationId, notPrimaryException);
    }
    catch (FabricObjectClosedException objectClosedException)
    {
        FaultAnalysisServiceUtility.TraceFabricObjectClosed(actionState.OperationId, objectClosedException);
    }
    catch (Exception unexpected)
    {
        TestabilityTrace.TraceSource.WriteWarning(TraceType, "{0} caught exception - {1}", actionState.OperationId, unexpected);
        throw;
    }

    TestabilityTrace.TraceSource.WriteInfo(TraceType, "{0} - Exiting state machine", actionState.OperationId);
}
/// <summary>
/// Performing-actions step of the invoke-data-loss command.
/// </summary>
/// <remarks>
/// In order, this step:
/// 1. adds an unreliable transport behavior ("*", "DoReconfiguration"), filtered to the target
///    partition, on the node recorded in <c>state.Info.NodeName</c>;
/// 2. queries the partition's replicas and selects targets according to the data loss mode;
/// 3. removes each target replica and waits for all of them to be dropped;
/// 4. removes the unreliable transport behavior;
/// 5. polls the partition's primary epoch until its data loss number differs from the one
///    recorded in <c>state.Info.DataLossNumber</c>, or the polling window elapses.
/// </remarks>
/// <param name="cancellationToken">Passed to every awaited fabric call and delay.</param>
/// <param name="serviceInternalFaultInfo">Forwarded to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
/// <returns>The state with <c>StepStateNames.CompletedSuccessfully</c> pushed onto its progress stack.</returns>
/// <exception cref="FabricTransientException">
/// Thrown when no target replicas could be selected, or when the data loss number did not change
/// within the polling window.
/// </exception>
/// <exception cref="FabricException">Thrown when the target partition is not in the partition list.</exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    PartitionSelector partitionSelector = state.Info.PartitionSelector;
    DataLossMode dataLossMode = state.Info.DataLossMode;
    long preDataLossNumber = state.Info.DataLossNumber;
    string failoverManagerPrimaryNodeName = state.Info.NodeName;
    Guid partitionId = state.Info.PartitionId;
    string behaviorName = state.Info.UnreliableTransportInfo.First().Item2;

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying UT, partitionId={1}", this.State.OperationId, partitionId);
    System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "DoReconfiguration");
    behavior.AddFilterForPartitionId(partitionId);

    await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
            failoverManagerPrimaryNodeName,
            behaviorName,
            behavior,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // TODO: Wait for some time so that the unreliable transport behavior can be read from the files.
    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    ServiceReplicaList replicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            partitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    List<StatefulServiceReplica> replicaList = new List<StatefulServiceReplica>();
    foreach (var replica in replicasResult)
    {
        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulReplica == null, "Service Replica is not of stateful type even though service is stateful");
        replicaList.Add(statefulReplica);
    }

    // Select target replicas based on the DataLossMode
    List<StatefulServiceReplica> targets = null;
    if (dataLossMode == DataLossMode.FullDataLoss)
    {
        targets = GetReplicasForFullDataLoss(replicaList);
    }
    else if (dataLossMode == DataLossMode.PartialDataLoss)
    {
        targets = FaultAnalysisServiceUtility.GetReplicasForPartialLoss(state.OperationId, replicaList);
    }
    else
    {
        throw FaultAnalysisServiceUtility.CreateException(StepBase.TraceType, Interop.NativeTypes.FABRIC_ERROR_CODE.E_INVALIDARG, Strings.StringResources.Error_UnsupportedDataLossMode);
    }

    if (targets == null)
    {
        // This will cause the command to rollback and retry
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    foreach (var replica in targets)
    {
        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Removing replica {1} in partition {2} with role {3} and status {4} to induce data loss",
            this.State.OperationId,
            replica.Id,
            partitionId,
            replica.ReplicaRole,
            replica.ReplicaStatus);

        await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.ServiceManager.RemoveReplicaAsync(
                replica.NodeName,
                partitionId,
                replica.Id,
                this.RequestTimeout,
                cancellationToken),
            FabricClientRetryErrors.RemoveReplicaErrors.Value,
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);
    }

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    await this.WaitForAllTargetReplicasToGetDroppedAsync(partitionId, targets, cancellationToken).ConfigureAwait(false);

    await RemoveUnreliableTransportAsync(this.FabricClient, failoverManagerPrimaryNodeName, behaviorName, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

    bool dataLossWasSuccessful = false;

    // NOTE(review): the polling window is hard-coded to 30 seconds, while the failure message below
    // reports this.dataLossCheckWaitDurationInSeconds - confirm whether the two are meant to match.
    TimeoutHelper timeoutHelper = new TimeoutHelper(TimeSpan.FromSeconds(30));
    do
    {
        ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.QueryManager.GetPartitionListAsync(
                this.partitionSelector.ServiceName,
                null,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);

        bool partitionFound = false;
        long postDataLossNumber = 0;
        foreach (StatefulServicePartition partition in partitionsResult)
        {
            if (partition.PartitionInformation.Id == partitionId)
            {
                postDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
                partitionFound = true;
                break;
            }
        }

        if (!partitionFound)
        {
            throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
        }

        TestabilityTrace.TraceSource.WriteInfo(
            StepBase.TraceType,
            "{0} - Checking data loss numbers for partition {1} with remaining time {2}. Current numbers {3}:{4}",
            this.State.OperationId,
            partitionId,
            timeoutHelper.GetRemainingTime(),
            preDataLossNumber,
            postDataLossNumber);

        // A changed data loss number in the primary epoch is the signal that data loss happened.
        if (postDataLossNumber != preDataLossNumber)
        {
            dataLossWasSuccessful = true;
            break;
        }

        await System.Fabric.Common.AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(this.dataLossCheckPollIntervalInSeconds), cancellationToken).ConfigureAwait(false);
    }
    while (timeoutHelper.GetRemainingTime() > TimeSpan.Zero);

    if (!dataLossWasSuccessful)
    {
        // This is only viewable internally for debug. This will cause a retry of the whole flow.
        string error = string.Format(
            CultureInfo.InvariantCulture,
            "{0} - Service could not induce data loss for service '{1}' partition '{2}' in '{3}' Please retry",
            this.State.OperationId,
            partitionSelector.ServiceName,
            partitionId,
            this.dataLossCheckWaitDurationInSeconds);

        // The message is already formatted and embeds the service name, so pass it as an argument
        // rather than as the format string.
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, "{0}", error);
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);

    return state;
}
/// <summary>
/// Looking-up step of the StopNode command. Queries the node, reads the service's own
/// stopped-node record, rejects the command when the node cannot be stopped, and records the
/// initial node status on the state before moving to <c>PerformingActions</c>.
/// </summary>
/// <param name="cancellationToken">Passed to every awaited call.</param>
/// <param name="serviceInternalFaultInfo">Not used by this step.</param>
/// <returns>The state with <c>StepStateNames.PerformingActions</c> pushed onto its progress stack.</returns>
/// <exception cref="FatalException">
/// Thrown when the node is missing or has status Invalid/Unknown/Removed, when it is already
/// stopped, or when it is down without having been stopped.
/// </exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    NodeCommandState state = Convert(this.State);

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StopNode.LookingUpState performing node query", this.State.OperationId);

    Node queriedNode = await FaultAnalysisServiceUtility.GetNodeInfoAsync(
        this.State.OperationId,
        this.FabricClient,
        state.Info.NodeName,
        this.action.Partition,
        this.action.StateManager,
        this.action.StoppedNodeTable,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StopNode.LookingUpState node query completed", this.State.OperationId);

    // Check for bad state
    bool nodeIsUnusable =
        queriedNode == null ||
        queriedNode.NodeStatus == NodeStatus.Invalid ||
        queriedNode.NodeStatus == NodeStatus.Unknown ||
        queriedNode.NodeStatus == NodeStatus.Removed;

    if (nodeIsUnusable)
    {
        // Fail the command
        string notFoundMessage = string.Format(CultureInfo.InvariantCulture, "{0} - Node {1} does not exist", this.State.OperationId, state.Info.NodeName);
        Exception nodeNotFoundException = FaultAnalysisServiceUtility.CreateException(
            TraceType,
            Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_NOT_FOUND,
            notFoundMessage);

        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - throwing fatal exception {1}", this.State.OperationId, nodeNotFoundException);
        throw new FatalException("fatal", nodeNotFoundException);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StopNode LookingUpState reading RD", this.State.OperationId);

    bool isStopped = await FaultAnalysisServiceUtility.ReadStoppedNodeStateAsync(
        this.State.OperationId,
        this.action.Partition,
        this.action.StateManager,
        this.action.StoppedNodeTable,
        state.Info.NodeName,
        cancellationToken).ConfigureAwait(false);

    bool isDown = queriedNode.NodeStatus == NodeStatus.Down;

    if (isDown && isStopped)
    {
        // Node already stopped
        Exception nodeAlreadyStopped = FaultAnalysisServiceUtility.CreateException(
            TraceType,
            Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_ALREADY_STOPPED,
            string.Format(CultureInfo.InvariantCulture, "Node {0} is already stopped", state.Info.NodeName));

        throw new FatalException("fatal", nodeAlreadyStopped);
    }

    if (isDown && !isStopped)
    {
        // Node is down (as opposed to stopped)
        Exception nodeIsDown = FaultAnalysisServiceUtility.CreateException(
            TraceType,
            Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_DOWN,
            string.Format(CultureInfo.InvariantCulture, "Node {0} is down", state.Info.NodeName));

        throw new FatalException("fatal", nodeIsDown);
    }

    if (isStopped)
    {
        // The node is not down, yet FAS has it recorded as stopped, so the record is wrong, perhaps
        // because of an out of band start from the original deprecated api.
        // Correct the record, then continue to run this command normally. It is valid.
        await FaultAnalysisServiceUtility.SetStoppedNodeStateAsync(
            this.action.State.OperationId,
            this.action.Partition,
            this.action.StateManager,
            this.action.StoppedNodeTable,
            queriedNode.NodeName,
            false,
            cancellationToken).ConfigureAwait(false);
    }

    // The recorded values reflect what was observed on entry, before any correction above.
    state.Info.InitialQueriedNodeStatus = queriedNode.NodeStatus;
    state.Info.NodeWasInitiallyInStoppedState = isStopped;

    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - StopNode LookingUpState InitialQueriedNodeStatus='{1}', NodeWasInitiallyInStoppedState='{2}'",
        this.State.OperationId,
        state.Info.InitialQueriedNodeStatus,
        state.Info.NodeWasInitiallyInStoppedState);

    state.StateProgress.Push(StepStateNames.PerformingActions);

    return state;
}
/// <summary>
/// Looking-up step of the invoke-data-loss command. Gathers everything the performing-actions
/// step needs and stores it on <c>state.Info</c>: the target partition id, its current data loss
/// number, the service's target replica set size, and the node hosting the Failover Manager primary.
/// </summary>
/// <param name="cancellationToken">Passed to every awaited fabric call.</param>
/// <param name="serviceInternalFaultInfo">Not used by this step.</param>
/// <returns>The state with <c>StepStateNames.PerformingActions</c> pushed onto its progress stack.</returns>
/// <exception cref="FabricInvalidForStatelessServicesException">Thrown when the service is not stateful.</exception>
/// <exception cref="FabricException">
/// Thrown when the selected partition is not in the service's partition list, or when no ready
/// Failover Manager primary replica is found.
/// </exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeDataLossState state = Convert(this.State);

    ServiceDescription result = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.ServiceManager.GetServiceDescriptionAsync(
            this.partitionSelector.ServiceName,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    if (result.Kind != ServiceDescriptionKind.Stateful)
    {
        // The message in the first arg is only for debugging, it is not returned to the user.
        throw new FabricInvalidForStatelessServicesException("FabricInvalidForStatelessServicesException", FabricErrorCode.InvalidForStatelessServices);
    }

    // Check the cast explicitly instead of dereferencing the result of 'as' directly, following the
    // as + ReleaseAssert pattern used for the replica casts below.
    StatefulServiceDescription statefulServiceDescription = result as StatefulServiceDescription;
    ReleaseAssert.AssertIf(statefulServiceDescription == null, "Service description is not of stateful type even though service is stateful");
    int targetReplicaSetSize = statefulServiceDescription.TargetReplicaSetSize;

    SelectedPartition targetPartition = await FaultAnalysisServiceUtility.GetSelectedPartitionStateAsync(
        this.FabricClient,
        this.partitionSelector,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    Guid partitionId = targetPartition.PartitionId;

    long preDataLossNumber = 0;

    ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetPartitionListAsync(
            this.partitionSelector.ServiceName,
            null,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Record the data loss number before the fault so the next step can detect a change.
    bool partitionFound = false;
    foreach (StatefulServicePartition partition in partitionsResult)
    {
        if (partition.PartitionInformation.Id == partitionId)
        {
            preDataLossNumber = partition.PrimaryEpoch.DataLossNumber;
            partitionFound = true;
            break;
        }
    }

    if (!partitionFound)
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionNotFound), FabricErrorCode.PartitionNotFound);
    }

    ServiceReplicaList failoverManagerReplicasResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
        () => this.FabricClient.QueryManager.GetReplicaListAsync(
            FASConstants.FmPartitionId,
            0,
            this.RequestTimeout,
            cancellationToken),
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // Locate the node hosting the ready Failover Manager primary replica.
    string failoverManagerPrimaryNodeName = string.Empty;
    var readyFMReplicas = failoverManagerReplicasResult.Where(r => r.ReplicaStatus == ServiceReplicaStatus.Ready).ToArray();
    foreach (var replica in readyFMReplicas)
    {
        StatefulServiceReplica statefulReplica = replica as StatefulServiceReplica;
        ReleaseAssert.AssertIf(statefulReplica == null, "FM Replica is not a stateful replica");
        if (statefulReplica.ReplicaRole == ReplicaRole.Primary)
        {
            failoverManagerPrimaryNodeName = replica.NodeName;
        }
    }

    if (string.IsNullOrEmpty(failoverManagerPrimaryNodeName))
    {
        throw new FabricException(StringHelper.Format(StringResources.Error_PartitionPrimaryNotReady, "FailoverManager"), FabricErrorCode.NotReady);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - FM primary location={1}", this.State.OperationId, failoverManagerPrimaryNodeName);

    // Pair of (node name, behavior name) identifying the unreliable transport behavior for this operation.
    string behaviorName = "BlockDoReconfiguration_" + this.State.OperationId;
    List<Tuple<string, string>> unreliableTransportInfo = new List<Tuple<string, string>>();
    unreliableTransportInfo.Add(new Tuple<string, string>(failoverManagerPrimaryNodeName, behaviorName));

    state.StateProgress.Push(StepStateNames.PerformingActions);
    state.Info.DataLossNumber = preDataLossNumber;
    state.Info.NodeName = failoverManagerPrimaryNodeName;
    state.Info.PartitionId = partitionId;
    state.Info.UnreliableTransportInfo = unreliableTransportInfo;
    state.Info.TargetReplicaSetSize = targetReplicaSetSize;

    return state;
}
/// <summary>
/// Runs the forward work of a single step. The overrides visible in this file push the next
/// <c>StepStateNames</c> value onto the state's <c>StateProgress</c> stack and return that state.
/// </summary>
/// <param name="cancellationToken">Cancellation token; the visible overrides pass it to the calls they await.</param>
/// <param name="serviceInternalFaultInfo">
/// Internal fault description; some visible overrides forward it to
/// <c>ActionTest.PerformInternalServiceFaultIfRequested</c>, others do not use it.
/// </param>
/// <returns>A task producing the action state after the step has run.</returns>
public abstract Task <ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo);
/// <summary>
/// First step of the test-only retry command. Traces entry, waits 500 ms to simulate work, and
/// pushes <c>StepStateNames.PerformingActions</c> onto the state's progress stack.
/// </summary>
/// <param name="cancellationToken">Not observed by this step.</param>
/// <param name="serviceInternalFaultInfo">Not used by this step.</param>
/// <returns>The step's state object after the next step marker has been pushed.</returns>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    TestabilityTrace.TraceSource.WriteInfo(
        StepBase.TraceType,
        "{0} - Inside TestRetryStepStepOne",
        this.State.OperationId);

    // Simulate work
    TimeSpan simulatedWorkDuration = TimeSpan.FromMilliseconds(500);
    await Task.Delay(simulatedWorkDuration).ConfigureAwait(false);

    ActionStateBase advancedState = this.State;
    advancedState.StateProgress.Push(StepStateNames.PerformingActions);
    return advancedState;
}
/// <summary>
/// Looking-up step of the StartNode command. Queries the node, reads the service's own
/// stopped-node record, rejects the command when the node cannot be started, and records the
/// initial node status on the state before moving to <c>PerformingActions</c>.
/// </summary>
/// <param name="cancellationToken">Passed to every awaited call.</param>
/// <param name="serviceInternalFaultInfo">Not used by this step.</param>
/// <returns>The state with <c>StepStateNames.PerformingActions</c> pushed onto its progress stack.</returns>
/// <exception cref="FatalException">
/// Thrown when the node query returns no node, when the node is already running, or when the
/// node is down without having been stopped.
/// </exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    NodeCommandState state = Convert(this.State);

    Node queriedNode = await FaultAnalysisServiceUtility.GetNodeInfoAsync(
        this.State.OperationId,
        this.FabricClient,
        state.Info.NodeName,
        this.action.Partition,
        this.action.StateManager,
        this.action.StoppedNodeTable,
        this.RequestTimeout,
        this.OperationTimeout,
        cancellationToken).ConfigureAwait(false);

    // The StopNode looking-up step treats a null result from this query as "node not found".
    // Fail the command the same way here rather than dereferencing a null node further down.
    if (queriedNode == null)
    {
        Exception nodeNotFound = FaultAnalysisServiceUtility.CreateException(
            TraceType,
            Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_NOT_FOUND,
            string.Format(CultureInfo.InvariantCulture, "{0} - Node {1} does not exist", this.State.OperationId, state.Info.NodeName));

        throw new FatalException("fatal", nodeNotFound);
    }

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StartNode LookingUpState reading RD", this.State.OperationId);
    bool isStopped = await FaultAnalysisServiceUtility.ReadStoppedNodeStateAsync(
        this.State.OperationId,
        this.action.Partition,
        this.action.StateManager,
        this.action.StoppedNodeTable,
        state.Info.NodeName,
        cancellationToken).ConfigureAwait(false);

    if (FaultAnalysisServiceUtility.IsNodeRunning(queriedNode))
    {
        if (!isStopped)
        {
            // For illustration, if you just called StartNodeUsingNodeNameAsync() in this situation w/o checking first, you'd either get instance mismatch or node has not stopped yet
            // Note: this is different than the logic in the PerformingActions step (the former does not check instance id, the latter does), which is after the call to StartNodeUsingNodeNameAsync(), because
            // this is a precondition check.
            Exception nodeAlreadyUp = FaultAnalysisServiceUtility.CreateException(
                TraceType,
                NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_UP,
                string.Format(CultureInfo.InvariantCulture, "Node {0} already started", state.Info.NodeName),
                FabricErrorCode.NodeIsUp);

            throw new FatalException("fatal", nodeAlreadyUp);
        }
        else
        {
            // The only way this can happen is OOB start. FAS should fix its incorrect state then fail the command with
            // node already up.
            TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - StartNode LookingUpState setting RD entry for node {1} to not stopped", this.State.OperationId, state.Info.NodeName);
            await FaultAnalysisServiceUtility.SetStoppedNodeStateAsync(
                this.action.State.OperationId,
                this.action.Partition,
                this.action.StateManager,
                this.action.StoppedNodeTable,
                queriedNode.NodeName,
                false,
                cancellationToken).ConfigureAwait(false);

            Exception nodeIsUp = FaultAnalysisServiceUtility.CreateException(
                TraceType,
                Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_UP,
                string.Format(CultureInfo.InvariantCulture, "Node {0} is up", state.Info.NodeName));

            throw new FatalException("fatal", nodeIsUp);
        }
    }
    else if (queriedNode.NodeStatus == NodeStatus.Down && !isStopped)
    {
        // This is a special scenario that can happen if:
        // 1) There was an OOB stop using the old api
        // 2) A node went down (not stopped, down)
        // Don't handle this, return node down.
        Exception nodeIsDown = FaultAnalysisServiceUtility.CreateException(
            TraceType,
            Interop.NativeTypes.FABRIC_ERROR_CODE.FABRIC_E_NODE_IS_DOWN,
            string.Format(CultureInfo.InvariantCulture, "Node {0} is down", state.Info.NodeName));

        throw new FatalException("fatal", nodeIsDown);
    }

    state.Info.InitialQueriedNodeStatus = queriedNode.NodeStatus;
    state.Info.NodeWasInitiallyInStoppedState = isStopped;

    state.StateProgress.Push(StepStateNames.PerformingActions);

    return state;
}
public Task InsertCommandAsync(Command command) { Uri failoverManagerUri = new Uri("fabric:/System/FailoverManagerService"); TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "****Adding command: " + command); Task task = null; if (command == Command.FailoverManagerDataLoss) { PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); Guid id = MockClientCommandInfo[Command.FailoverManagerDataLoss]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, null); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.InvokeDataLossMidActionTestFatal) { PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction); Guid id = MockClientCommandInfo[Command.InvokeDataLossMidActionTestFatal]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed); } else if (command == Command.InvokeDataLossMidActionTestTransient) { // rollback and retry then success PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackActionAndRetry); Guid id = MockClientCommandInfo[Command.InvokeDataLossMidActionTestTransient]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.InvokeDataLossMidActionTestFailover) { // failover then success PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new 
ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.KillProcess); Guid id = MockClientCommandInfo[Command.InvokeDataLossMidActionTestFailover]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.FailoverManagerDataLossCauseActionRollbackFatal) { PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.CompletedSuccessfully, ServiceInternalFaultType.RollbackAction); Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackFatal]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed); } else if (command == Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetry) { PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.CompletedSuccessfully, ServiceInternalFaultType.RollbackActionAndRetry); Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetry]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.FailoverManagerDataLossFailoverFaultAnalysisService) { PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.PerformingActions, ServiceInternalFaultType.KillProcess); Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossFailoverFaultAnalysisService]; task = 
this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.FailoverManagerDataLossCauseActionRollbackFatalBeforeActionStep) { PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction); Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackFatalBeforeActionStep]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed); } else if (command == Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetryBeforeActionStep) { PartitionSelector ps = PartitionSelector.SingletonOf(failoverManagerUri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackActionAndRetry); Guid id = MockClientCommandInfo[Command.FailoverManagerDataLossCauseActionRollbackWithSuccessOnRetryBeforeActionStep]; task = this.messageProcessor.ProcessDataLossCommandAsync(id, ps, DataLossMode.FullDataLoss, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.InvokeQuorumLossMidActionTestFatal) { Uri uri = new Uri("fabric:/System/NamingService"); PartitionSelector ps = PartitionSelector.RandomOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction); Guid id = MockClientCommandInfo[Command.InvokeQuorumLossMidActionTestFatal]; task = this.messageProcessor.ProcessQuorumLossCommandAsync(id, ps, QuorumLossMode.AllReplicas, TimeSpan.FromSeconds(10.0d), FASConstants.DefaultTestTimeout, 
faultInfo); this.WaitForState(id, StepStateNames.Failed); } else if (command == Command.InvokeQuorumLossMidActionTestFailover) { Uri uri = new Uri("fabric:/System/NamingService"); PartitionSelector ps = PartitionSelector.RandomOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.KillProcess); Guid id = MockClientCommandInfo[Command.InvokeQuorumLossMidActionTestFailover]; task = this.messageProcessor.ProcessQuorumLossCommandAsync(id, ps, QuorumLossMode.AllReplicas, TimeSpan.FromSeconds(10.0d), FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.InvokeQuorumLossMidActionTestTransient) { Uri uri = new Uri("fabric:/System/NamingService"); PartitionSelector ps = PartitionSelector.RandomOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackActionAndRetry); Guid id = MockClientCommandInfo[Command.InvokeQuorumLossMidActionTestTransient]; task = this.messageProcessor.ProcessQuorumLossCommandAsync(id, ps, QuorumLossMode.AllReplicas, TimeSpan.FromSeconds(10.0d), FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.RestartPartitionMidActionTestFatal) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction); Guid id = MockClientCommandInfo[Command.RestartPartitionMidActionTestFatal]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed); } else if (command == 
Command.RestartPartitionMidActionTestFailover) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.KillProcess); Guid id = MockClientCommandInfo[Command.RestartPartitionMidActionTestFailover]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.RestartPartitionMidActionTestTransient) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo(StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackActionAndRetry); Guid id = MockClientCommandInfo[Command.RestartPartitionMidActionTestTransient]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully); } else if (command == Command.StuckAction) { Guid id = MockClientCommandInfo[Command.StuckAction]; task = this.messageProcessor.ProcessStuckCommandAsync(id, null); } else if (command == Command.RestartPartitionCancelOuterLoopNoForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.None, // for this case this value does not matter ServiceInternalFaultType.None, RollbackState.RollingBackDueToUserCancel, FASConstants.OuterLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterLoopNoForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, 
RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel); } else if (command == Command.RestartPartitionCancelForwardNoForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.None, ServiceInternalFaultType.None, RollbackState.RollingBackDueToUserCancel, FASConstants.ForwardLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardNoForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel); } else if (command == Command.RestartPartitionCancelForwardExceptionNoForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackDueToUserCancel, FASConstants.ForwardLoopExceptionBlock); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardExceptionNoForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel); } else if (command == Command.RestartPartitionCancelOuterCleanupNoForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction, 
RollbackState.RollingBackDueToUserCancel, FASConstants.OuterCleanupLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterCleanupNoForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel); } else if (command == Command.RestartPartitionCancelCleanupInnerNoForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackDueToUserCancel, FASConstants.InnerCleanupLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelCleanupInnerNoForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackDueToUserCancel); } else if (command == Command.RestartPartitionCancelOuterLoopForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.PerformingActions, // for this case, this value does not matter ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackForce, FASConstants.ForwardLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterLoopForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce); } else if (command == Command.RestartPartitionCancelForwardForce) { Uri uri = new 
Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackForce, FASConstants.ForwardLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce); } else if (command == Command.RestartPartitionCancelForwardExceptionForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackForce, FASConstants.ForwardLoopExceptionBlock); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelForwardExceptionForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce); } else if (command == Command.RestartPartitionCancelOuterCleanupForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackForce, FASConstants.OuterCleanupLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelOuterCleanupForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, 
faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce); } else if (command == Command.RestartPartitionCancelCleanupInnerForce) { Uri uri = new Uri("fabric:/System/ClusterManagerService"); PartitionSelector ps = PartitionSelector.SingletonOf(uri); ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.PerformingActions, ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackForce, FASConstants.InnerCleanupLoop); Guid id = MockClientCommandInfo[Command.RestartPartitionCancelCleanupInnerForce]; task = this.messageProcessor.ProcessRestartPartitionCommandAsync(id, ps, RestartPartitionMode.AllReplicasOrInstances, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce); } else if (command == Command.TestRetryStepWithSuccessAfterRetries) { // Intentionally fail the step corresponding to StepStateNames.PerformingActions step a few times, then run it normally (pass). It should succeed. Guid id = MockClientCommandInfo[Command.TestRetryStepWithSuccessAfterRetries]; ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.MidPerformingActions, ServiceInternalFaultType.ThrowThreeTimes); task = this.messageProcessor.ProcessRetryStepCommandAsync(id, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully, RollbackState.NotRollingBack); } else if (command == Command.TestRetryStepWithForceCancel) { // Force cancel a command with ActionStateBase.RetryStepWithoutRollingBackOnFailure set to true Guid id = MockClientCommandInfo[Command.TestRetryStepWithForceCancel]; ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.CompletedSuccessfully, // this just has to be a late step so an earlier fault is not used before we reach the situation we want. 
ServiceInternalFaultType.RollbackAction, RollbackState.RollingBackForce, // note, the graceful one should not cause cancellation since for this type we only allow user cancellation when force is true FASConstants.InnerForwardLoop, StepStateNames.PerformingActions); task = this.messageProcessor.ProcessRetryStepCommandAsync(id, faultInfo); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce); } else if (command == Command.StopNodeWithUnknownException) { Guid id = MockClientCommandInfo[Command.StopNodeWithUnknownException]; ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.MidPerformingActions, ServiceInternalFaultType.RollbackAction); // In this case, since start and stop node do not rollback like other commands, this exception should cause the step to retry. Node target = ActionTest.GetNodeWithFASSecondary().Result; TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} stopping {1}:{2}", id, target.NodeName, target.NodeInstanceId); task = this.messageProcessor.ProcessStopNodeCommandAsync(id, target.NodeName, target.NodeInstanceId, 999, FASConstants.DefaultTestTimeout, faultInfo); // Let the command make progress Task.Delay(TimeSpan.FromSeconds(30)).Wait(); this.WaitForState(id, StepStateNames.PerformingActions, RollbackState.NotRollingBack); // This should not result in cancellation, since start and stop node have different rollback policies than the other commands. TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} - cancelling with force==false. This should not cancel the command", id); this.messageProcessor.CancelTestCommandAsync(id, false); this.WaitForState(id, StepStateNames.PerformingActions, RollbackState.RollingBackDueToUserCancel); // Now force cancel. This should cancel. TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} - cancelling with force==true. 
This should cancel the command", id); this.messageProcessor.CancelTestCommandAsync(id, true); this.WaitForState(id, StepStateNames.Failed, RollbackState.RollingBackForce); NodeList nodes = ActionTest.GetNodeListAsync().Result; TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} - node info:", id); foreach (Node n in nodes) { TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, " OperationId:{0} - NodeName{1}, NodeStatus:{2}, IsStopped:{3}", id, n.NodeName, n.NodeStatus, n.IsStopped); } Node targetNodeAfterTest = nodes.Where(n => n.NodeName == target.NodeName).FirstOrDefault(); if (targetNodeAfterTest == null) { throw new InvalidOperationException("target node was not found in query after test"); } if (targetNodeAfterTest.IsStopped == false) { throw new InvalidOperationException("target node should have IsStopped true, was false"); } } else if (command == Command.StopNodeWithExceptionAndSuccessAfterRetries) { Guid id = MockClientCommandInfo[Command.StopNodeWithExceptionAndSuccessAfterRetries]; // Inject a fault during the operation so that step "StepStateNames.MidPerformingActions" has to retry 3 times before succeeding ServiceInternalFaultInfo faultInfo = new ServiceInternalFaultInfo( StepStateNames.MidPerformingActions, ServiceInternalFaultType.ThrowThreeTimes); Node target = ActionTest.GetNodeWithFASSecondary().Result; TestabilityTrace.TraceSource.WriteInfo(MockClient.TraceType, "{0} stopping {1}:{2}", id, target.NodeName, target.NodeInstanceId); task = this.messageProcessor.ProcessStopNodeCommandAsync(id, target.NodeName, target.NodeInstanceId, 999, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully, RollbackState.NotRollingBack); // Start the stopped node task = this.messageProcessor.ProcessStartNodeCommandAsync(Guid.NewGuid(), target.NodeName, target.NodeInstanceId, FASConstants.DefaultTestTimeout, faultInfo); this.WaitForState(id, StepStateNames.CompletedSuccessfully, 
RollbackState.NotRollingBack); } else { ReleaseAssert.Failfast("Unexpected command"); } return(task); }
/// <summary>
/// Drives the target partition into quorum loss, keeps it there for the requested duration,
/// confirms the loss through a partition query, and then calls
/// <c>QuorumLossStepsFactory.RemoveUTAsync</c> before marking the step as completed.
/// </summary>
/// <param name="cancellationToken">Token passed to every fabric call and delay issued by this step.</param>
/// <param name="serviceInternalFaultInfo">Fault description forwarded to <c>ActionTest.PerformInternalServiceFaultIfRequested</c>.</param>
/// <returns>
/// The <c>InvokeQuorumLossState</c> of this operation, with <c>StepStateNames.CompletedSuccessfully</c>
/// pushed onto its <c>StateProgress</c>.
/// </returns>
/// <exception cref="FabricTransientException">
/// Thrown with <c>FabricErrorCode.NotReady</c> when no partition query reports the target partition
/// as <c>InQuorumLoss</c> within <c>FASConstants.QuorumLossCheckRetryCount</c> retries.
/// </exception>
public override async Task<ActionStateBase> RunAsync(CancellationToken cancellationToken, ServiceInternalFaultInfo serviceInternalFaultInfo)
{
    InvokeQuorumLossState state = Convert(this.State);

    Guid partitionId = state.Info.PartitionId;
    List<Tuple<string, string>> unreliableTransportInfo = state.Info.UnreliableTransportInfo;
    List<long> targetReplicas = state.Info.ReplicaIds;

    // Each tuple is (node name, behavior name). All behaviors are submitted first and awaited together.
    var unreliableTransportTaskList = new List<Task>();
    foreach (Tuple<string, string> ut in unreliableTransportInfo)
    {
        string nodeName = ut.Item1;
        string behaviorName = ut.Item2;

        // NOTE(review): presumably this filter keeps restarted replicas of the partition from reopening - confirm.
        System.Fabric.Common.UnreliableTransportBehavior behavior = new System.Fabric.Common.UnreliableTransportBehavior("*", "StatefulServiceReopen");
        behavior.AddFilterForPartitionId(partitionId);

        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - applying '{1}'", this.State.OperationId, behaviorName);

        unreliableTransportTaskList.Add(FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.TestManager.AddUnreliableTransportBehaviorAsync(
                nodeName,
                behaviorName,
                behavior,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken));
    }

    await Task.WhenAll(unreliableTransportTaskList).ConfigureAwait(false);

    // Bug#2271465 - Unreliable transport through API should return only once the behavior has been successfully applied
    await Task.Delay(TimeSpan.FromSeconds(5.0), cancellationToken).ConfigureAwait(false);

    // Restart every target replica in parallel, without waiting for the restarts to be verified.
    List<Task> tasks = new List<Task>();
    foreach (long replicaId in targetReplicas)
    {
        ReplicaSelector replicaSelector = ReplicaSelector.ReplicaIdOf(PartitionSelector.PartitionIdOf(this.partitionSelector.ServiceName, partitionId), replicaId);

        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - faulting replica with id={1}", this.State.OperationId, replicaId);
        Task task = FaultAnalysisServiceUtility.RestartReplicaAsync(this.FabricClient, replicaSelector, CompletionMode.DoNotVerify, this.RequestTimeout, this.OperationTimeout, cancellationToken);
        tasks.Add(task);
    }

    await Task.WhenAll(tasks).ConfigureAwait(false);

    ActionTest.PerformInternalServiceFaultIfRequested(this.State.OperationId, serviceInternalFaultInfo, this.State, cancellationToken, true);

    TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - keeping partition in quorum loss for '{1}'", this.State.OperationId, state.Info.QuorumLossDuration);
    await Task.Delay(state.Info.QuorumLossDuration, cancellationToken).ConfigureAwait(false);

    // Confirm the partition is reported as InQuorumLoss: one initial query plus up to
    // FASConstants.QuorumLossCheckRetryCount further queries.
    bool conditionSatisfied = false;
    int quorumLossCheckRetries = FASConstants.QuorumLossCheckRetryCount;
    while (true)
    {
        TestabilityTrace.TraceSource.WriteInfo(StepBase.TraceType, "{0} - checking PartitionStatus", this.State.OperationId);
        ServicePartitionList partitionsResult = await FabricClientRetryHelper.ExecuteFabricActionWithRetryAsync(
            () => this.FabricClient.QueryManager.GetPartitionListAsync(
                this.partitionSelector.ServiceName,
                null,
                this.RequestTimeout,
                cancellationToken),
            this.OperationTimeout,
            cancellationToken).ConfigureAwait(false);

        foreach (StatefulServicePartition partition in partitionsResult)
        {
            if (partition.PartitionInformation.Id == partitionId
                && partition.PartitionStatus == ServicePartitionStatus.InQuorumLoss)
            {
                conditionSatisfied = true;
                break;
            }
        }

        // Only pause when another query will follow; once quorum loss is confirmed or the retries
        // are used up there is nothing left to wait for.
        if (conditionSatisfied || quorumLossCheckRetries-- <= 0)
        {
            break;
        }

        await AsyncWaiter.WaitAsync(TimeSpan.FromSeconds(5), cancellationToken).ConfigureAwait(false);
    }

    if (!conditionSatisfied)
    {
        string error = string.Format(CultureInfo.InvariantCulture, "{0} - Service could not induce quorum loss for service '{1}', partition '{2}'. Please retry", this.State.OperationId, this.partitionSelector.ServiceName, partitionId);
        TestabilityTrace.TraceSource.WriteWarning(StepBase.TraceType, error);

        // The detailed text above goes to the trace only; the exception carries a generic retry message.
        throw new FabricTransientException("The operation could not be performed, please retry", FabricErrorCode.NotReady);
    }

    // NOTE(review): presumably removes the unreliable transport behaviors applied above - confirm against RemoveUTAsync.
    await QuorumLossStepsFactory.RemoveUTAsync(this.FabricClient, this.State, this.RequestTimeout, this.OperationTimeout, cancellationToken).ConfigureAwait(false);

    state.StateProgress.Push(StepStateNames.CompletedSuccessfully);
    return state;
}
/// <summary>
/// Creates the state object for a node command.
/// </summary>
/// <param name="actionType">Action type passed to the base constructor.</param>
/// <param name="operationId">Operation id passed to the base constructor.</param>
/// <param name="nodeSync">Synchronizer stored in <c>NodeSync</c>.</param>
/// <param name="serviceInternalFaultInfo">Fault description passed to the base constructor.</param>
/// <param name="nodeName">Node name recorded in the <c>NodeCommandInfo</c>.</param>
/// <param name="nodeInstanceId">Node instance id recorded in the <c>NodeCommandInfo</c>.</param>
/// <param name="stopDurationInSeconds">Stop duration, in seconds, recorded in the <c>NodeCommandInfo</c>.</param>
public NodeCommandState(
    ActionType actionType,
    Guid operationId,
    NodeCommandSynchronizer nodeSync,
    ServiceInternalFaultInfo serviceInternalFaultInfo,
    string nodeName,
    BigInteger nodeInstanceId,
    int stopDurationInSeconds)
    : base(operationId, actionType, serviceInternalFaultInfo)
{
    // Bundle the node-specific arguments into a single info object.
    NodeCommandInfo commandInfo = new NodeCommandInfo(nodeName, nodeInstanceId, stopDurationInSeconds);

    Info = commandInfo;
    NodeSync = nodeSync;

    // Node command states always turn this flag on.
    RetryStepWithoutRollingBackOnFailure = true;
}